Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                 16
-rw-r--r--  mm/Makefile                 2
-rw-r--r--  mm/bounce.c                21
-rw-r--r--  mm/filemap.c               34
-rw-r--r--  mm/fremap.c                17
-rw-r--r--  mm/huge_memory.c           77
-rw-r--r--  mm/hugetlb.c               63
-rw-r--r--  mm/madvise.c               31
-rw-r--r--  mm/memblock.c              12
-rw-r--r--  mm/memcontrol.c           338
-rw-r--r--  mm/memory-failure.c         4
-rw-r--r--  mm/memory.c                61
-rw-r--r--  mm/memory_hotplug.c       101
-rw-r--r--  mm/migrate.c               24
-rw-r--r--  mm/mlock.c                 11
-rw-r--r--  mm/mmap.c                 202
-rw-r--r--  mm/nobootmem.c              6
-rw-r--r--  mm/nommu.c                 80
-rw-r--r--  mm/page-writeback.c         4
-rw-r--r--  mm/page_alloc.c            78
-rw-r--r--  mm/page_io.c               36
-rw-r--r--  mm/process_vm_access.c      8
-rw-r--r--  mm/rmap.c                   3
-rw-r--r--  mm/shmem.c                  5
-rw-r--r--  mm/slab.c                   8
-rw-r--r--  mm/slub.c                   9
-rw-r--r--  mm/sparse-vmemmap.c        27
-rw-r--r--  mm/sparse.c                82
-rw-r--r--  mm/swap.c                  11
-rw-r--r--  mm/swap_state.c             6
-rw-r--r--  mm/swapfile.c               2
-rw-r--r--  mm/vmalloc.c              218
-rw-r--r--  mm/vmpressure.c           374
-rw-r--r--  mm/vmscan.c                16
-rw-r--r--  mm/vmstat.c                 6
35 files changed, 1504 insertions, 489 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ae55c1e04d10..e742d06285b7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -263,8 +263,14 @@ config ZONE_DMA_FLAG
263 default "1" 263 default "1"
264 264
265config BOUNCE 265config BOUNCE
266 def_bool y 266 bool "Enable bounce buffers"
267 default y
267 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) 268 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
269 help
270 Enable bounce buffers for devices that cannot access
271 the full range of memory available to the CPU. Enabled
272 by default when ZONE_DMA or HIGHMEM is selected, but you
273 may say n to override this.
268 274
269# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often 275# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
270# have more than 4GB of memory, but we don't currently use the IOTLB to present 276# have more than 4GB of memory, but we don't currently use the IOTLB to present
@@ -286,8 +292,12 @@ config NR_QUICK
286 default "1" 292 default "1"
287 293
288config VIRT_TO_BUS 294config VIRT_TO_BUS
289 def_bool y 295 bool
290 depends on HAVE_VIRT_TO_BUS 296 help
297 An architecture should select this if it implements the
298 deprecated interface virt_to_bus(). All new architectures
299 should probably not select this.
300
291 301
292config MMU_NOTIFIER 302config MMU_NOTIFIER
293 bool 303 bool
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
50obj-$(CONFIG_MIGRATION) += migrate.o 50obj-$(CONFIG_MIGRATION) += migrate.o
51obj-$(CONFIG_QUICKLIST) += quicklist.o 51obj-$(CONFIG_QUICKLIST) += quicklist.o
52obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 52obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
53obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o 53obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
54obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 54obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
55obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 55obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
56obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 56obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 5f8901768602..a5c2ec3589cb 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -181,32 +181,13 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
181#ifdef CONFIG_NEED_BOUNCE_POOL 181#ifdef CONFIG_NEED_BOUNCE_POOL
182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) 182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
183{ 183{
184 struct page *page;
185 struct backing_dev_info *bdi;
186 struct address_space *mapping;
187 struct bio_vec *from;
188 int i;
189
190 if (bio_data_dir(bio) != WRITE) 184 if (bio_data_dir(bio) != WRITE)
191 return 0; 185 return 0;
192 186
193 if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) 187 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
194 return 0; 188 return 0;
195 189
196 /* 190 return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
197 * Based on the first page that has a valid mapping, decide whether or
198 * not we have to employ bounce buffering to guarantee stable pages.
199 */
200 bio_for_each_segment(from, bio, i) {
201 page = from->bv_page;
202 mapping = page_mapping(page);
203 if (!mapping)
204 continue;
205 bdi = mapping->backing_dev_info;
206 return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
207 }
208
209 return 0;
210} 191}
211#else 192#else
212static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) 193static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
diff --git a/mm/filemap.c b/mm/filemap.c
index e1979fdca805..e989fb1eaa72 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,9 @@
35#include <linux/cleancache.h> 35#include <linux/cleancache.h>
36#include "internal.h" 36#include "internal.h"
37 37
38#define CREATE_TRACE_POINTS
39#include <trace/events/filemap.h>
40
38/* 41/*
39 * FIXME: remove all knowledge of the buffer layer from the core VM 42 * FIXME: remove all knowledge of the buffer layer from the core VM
40 */ 43 */
@@ -113,6 +116,7 @@ void __delete_from_page_cache(struct page *page)
113{ 116{
114 struct address_space *mapping = page->mapping; 117 struct address_space *mapping = page->mapping;
115 118
119 trace_mm_filemap_delete_from_page_cache(page);
116 /* 120 /*
117 * if we're uptodate, flush out into the cleancache, otherwise 121 * if we're uptodate, flush out into the cleancache, otherwise
118 * invalidate any existing cleancache entries. We can't leave 122 * invalidate any existing cleancache entries. We can't leave
@@ -184,6 +188,17 @@ static int sleep_on_page_killable(void *word)
184 return fatal_signal_pending(current) ? -EINTR : 0; 188 return fatal_signal_pending(current) ? -EINTR : 0;
185} 189}
186 190
191static int filemap_check_errors(struct address_space *mapping)
192{
193 int ret = 0;
194 /* Check for outstanding write errors */
195 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
196 ret = -ENOSPC;
197 if (test_and_clear_bit(AS_EIO, &mapping->flags))
198 ret = -EIO;
199 return ret;
200}
201
187/** 202/**
188 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 203 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
189 * @mapping: address space structure to write 204 * @mapping: address space structure to write
@@ -265,10 +280,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
265 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; 280 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
266 struct pagevec pvec; 281 struct pagevec pvec;
267 int nr_pages; 282 int nr_pages;
268 int ret = 0; 283 int ret2, ret = 0;
269 284
270 if (end_byte < start_byte) 285 if (end_byte < start_byte)
271 return 0; 286 goto out;
272 287
273 pagevec_init(&pvec, 0); 288 pagevec_init(&pvec, 0);
274 while ((index <= end) && 289 while ((index <= end) &&
@@ -291,12 +306,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
291 pagevec_release(&pvec); 306 pagevec_release(&pvec);
292 cond_resched(); 307 cond_resched();
293 } 308 }
294 309out:
295 /* Check for outstanding write errors */ 310 ret2 = filemap_check_errors(mapping);
296 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 311 if (!ret)
297 ret = -ENOSPC; 312 ret = ret2;
298 if (test_and_clear_bit(AS_EIO, &mapping->flags))
299 ret = -EIO;
300 313
301 return ret; 314 return ret;
302} 315}
@@ -337,6 +350,8 @@ int filemap_write_and_wait(struct address_space *mapping)
337 if (!err) 350 if (!err)
338 err = err2; 351 err = err2;
339 } 352 }
353 } else {
354 err = filemap_check_errors(mapping);
340 } 355 }
341 return err; 356 return err;
342} 357}
@@ -368,6 +383,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
368 if (!err) 383 if (!err)
369 err = err2; 384 err = err2;
370 } 385 }
386 } else {
387 err = filemap_check_errors(mapping);
371 } 388 }
372 return err; 389 return err;
373} 390}
@@ -464,6 +481,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
464 mapping->nrpages++; 481 mapping->nrpages++;
465 __inc_zone_page_state(page, NR_FILE_PAGES); 482 __inc_zone_page_state(page, NR_FILE_PAGES);
466 spin_unlock_irq(&mapping->tree_lock); 483 spin_unlock_irq(&mapping->tree_lock);
484 trace_mm_filemap_add_to_page_cache(page);
467 } else { 485 } else {
468 page->mapping = NULL; 486 page->mapping = NULL;
469 /* Leave page->index set: truncation relies upon it */ 487 /* Leave page->index set: truncation relies upon it */
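
A note on the filemap.c change above: the AS_EIO/AS_ENOSPC tests are consolidated into filemap_check_errors(), and the new else branches mean filemap_write_and_wait() and filemap_write_and_wait_range() now report a previously recorded error even when there is nothing left to write back. A minimal sketch of a caller that benefits; example_fsync_range() is a hypothetical helper, not part of the patch:

/*
 * Hypothetical caller, not part of the patch.  With the change above,
 * an error recorded earlier in mapping->flags (AS_EIO/AS_ENOSPC) is
 * returned even when mapping->nrpages is zero, instead of being lost.
 */
static int example_fsync_range(struct file *file, loff_t start, loff_t end)
{
	struct address_space *mapping = file->f_mapping;

	/* Writes back dirty pages, waits, and picks up stored error bits. */
	return filemap_write_and_wait_range(mapping, start, end);
}
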
diff --git a/mm/fremap.c b/mm/fremap.c
index 0cd4c11488ed..87da3590c61e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,7 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
129 struct vm_area_struct *vma; 129 struct vm_area_struct *vma;
130 int err = -EINVAL; 130 int err = -EINVAL;
131 int has_write_lock = 0; 131 int has_write_lock = 0;
132 vm_flags_t vm_flags; 132 vm_flags_t vm_flags = 0;
133 133
134 if (prot) 134 if (prot)
135 return err; 135 return err;
@@ -204,10 +204,8 @@ get_write_lock:
204 unsigned long addr; 204 unsigned long addr;
205 struct file *file = get_file(vma->vm_file); 205 struct file *file = get_file(vma->vm_file);
206 206
207 vm_flags = vma->vm_flags; 207 addr = mmap_region(file, start, size,
208 if (!(flags & MAP_NONBLOCK)) 208 vma->vm_flags, pgoff);
209 vm_flags |= VM_POPULATE;
210 addr = mmap_region(file, start, size, vm_flags, pgoff);
211 fput(file); 209 fput(file);
212 if (IS_ERR_VALUE(addr)) { 210 if (IS_ERR_VALUE(addr)) {
213 err = addr; 211 err = addr;
@@ -226,12 +224,6 @@ get_write_lock:
226 mutex_unlock(&mapping->i_mmap_mutex); 224 mutex_unlock(&mapping->i_mmap_mutex);
227 } 225 }
228 226
229 if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
230 if (!has_write_lock)
231 goto get_write_lock;
232 vma->vm_flags |= VM_POPULATE;
233 }
234
235 if (vma->vm_flags & VM_LOCKED) { 227 if (vma->vm_flags & VM_LOCKED) {
236 /* 228 /*
237 * drop PG_Mlocked flag for over-mapped range 229 * drop PG_Mlocked flag for over-mapped range
@@ -254,7 +246,8 @@ get_write_lock:
254 */ 246 */
255 247
256out: 248out:
257 vm_flags = vma->vm_flags; 249 if (vma)
250 vm_flags = vma->vm_flags;
258 if (likely(!has_write_lock)) 251 if (likely(!has_write_lock))
259 up_read(&mm->mmap_sem); 252 up_read(&mm->mmap_sem);
260 else 253 else
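
For context on the remap_file_pages() path touched above, here is a small userspace sketch of the syscall it serves; "data.bin" and the two-page size are assumptions for illustration only:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("data.bin", O_RDONLY);	/* hypothetical file, >= 2 pages */
	char *map;

	if (fd < 0) { perror("open"); return 1; }

	map = mmap(NULL, 2 * psz, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) { perror("mmap"); return 1; }

	/* Rearrange the second page of the mapping to show file page 0
	 * (pgoff 0); prot and flags must be 0 for this syscall. */
	if (remap_file_pages(map + psz, psz, 0, 0, 0)) {
		perror("remap_file_pages");
		return 1;
	}
	printf("first file byte via remapped window: 0x%02x\n",
	       (unsigned char)map[psz]);
	return 0;
}
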
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2f7f5aaaafb..03a89a2f464b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -163,35 +163,34 @@ static int start_khugepaged(void)
163} 163}
164 164
165static atomic_t huge_zero_refcount; 165static atomic_t huge_zero_refcount;
166static unsigned long huge_zero_pfn __read_mostly; 166static struct page *huge_zero_page __read_mostly;
167 167
168static inline bool is_huge_zero_pfn(unsigned long pfn) 168static inline bool is_huge_zero_page(struct page *page)
169{ 169{
170 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); 170 return ACCESS_ONCE(huge_zero_page) == page;
171 return zero_pfn && pfn == zero_pfn;
172} 171}
173 172
174static inline bool is_huge_zero_pmd(pmd_t pmd) 173static inline bool is_huge_zero_pmd(pmd_t pmd)
175{ 174{
176 return is_huge_zero_pfn(pmd_pfn(pmd)); 175 return is_huge_zero_page(pmd_page(pmd));
177} 176}
178 177
179static unsigned long get_huge_zero_page(void) 178static struct page *get_huge_zero_page(void)
180{ 179{
181 struct page *zero_page; 180 struct page *zero_page;
182retry: 181retry:
183 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 182 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
184 return ACCESS_ONCE(huge_zero_pfn); 183 return ACCESS_ONCE(huge_zero_page);
185 184
186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 185 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
187 HPAGE_PMD_ORDER); 186 HPAGE_PMD_ORDER);
188 if (!zero_page) { 187 if (!zero_page) {
189 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 188 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
190 return 0; 189 return NULL;
191 } 190 }
192 count_vm_event(THP_ZERO_PAGE_ALLOC); 191 count_vm_event(THP_ZERO_PAGE_ALLOC);
193 preempt_disable(); 192 preempt_disable();
194 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { 193 if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
195 preempt_enable(); 194 preempt_enable();
196 __free_page(zero_page); 195 __free_page(zero_page);
197 goto retry; 196 goto retry;
@@ -200,7 +199,7 @@ retry:
200 /* We take additional reference here. It will be put back by shrinker */ 199 /* We take additional reference here. It will be put back by shrinker */
201 atomic_set(&huge_zero_refcount, 2); 200 atomic_set(&huge_zero_refcount, 2);
202 preempt_enable(); 201 preempt_enable();
203 return ACCESS_ONCE(huge_zero_pfn); 202 return ACCESS_ONCE(huge_zero_page);
204} 203}
205 204
206static void put_huge_zero_page(void) 205static void put_huge_zero_page(void)
@@ -220,9 +219,9 @@ static int shrink_huge_zero_page(struct shrinker *shrink,
220 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 219 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
221 220
222 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 221 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
223 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); 222 struct page *zero_page = xchg(&huge_zero_page, NULL);
224 BUG_ON(zero_pfn == 0); 223 BUG_ON(zero_page == NULL);
225 __free_page(__pfn_to_page(zero_pfn)); 224 __free_page(zero_page);
226 } 225 }
227 226
228 return 0; 227 return 0;
@@ -713,6 +712,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
713 return VM_FAULT_OOM; 712 return VM_FAULT_OOM;
714 713
715 clear_huge_page(page, haddr, HPAGE_PMD_NR); 714 clear_huge_page(page, haddr, HPAGE_PMD_NR);
715 /*
716 * The memory barrier inside __SetPageUptodate makes sure that
717 * clear_huge_page writes become visible before the set_pmd_at()
718 * write.
719 */
716 __SetPageUptodate(page); 720 __SetPageUptodate(page);
717 721
718 spin_lock(&mm->page_table_lock); 722 spin_lock(&mm->page_table_lock);
@@ -724,12 +728,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
724 } else { 728 } else {
725 pmd_t entry; 729 pmd_t entry;
726 entry = mk_huge_pmd(page, vma); 730 entry = mk_huge_pmd(page, vma);
727 /*
728 * The spinlocking to take the lru_lock inside
729 * page_add_new_anon_rmap() acts as a full memory
730 * barrier to be sure clear_huge_page writes become
731 * visible after the set_pmd_at() write.
732 */
733 page_add_new_anon_rmap(page, vma, haddr); 731 page_add_new_anon_rmap(page, vma, haddr);
734 set_pmd_at(mm, haddr, pmd, entry); 732 set_pmd_at(mm, haddr, pmd, entry);
735 pgtable_trans_huge_deposit(mm, pgtable); 733 pgtable_trans_huge_deposit(mm, pgtable);
@@ -765,12 +763,12 @@ static inline struct page *alloc_hugepage(int defrag)
765 763
766static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 764static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
767 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 765 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
768 unsigned long zero_pfn) 766 struct page *zero_page)
769{ 767{
770 pmd_t entry; 768 pmd_t entry;
771 if (!pmd_none(*pmd)) 769 if (!pmd_none(*pmd))
772 return false; 770 return false;
773 entry = pfn_pmd(zero_pfn, vma->vm_page_prot); 771 entry = mk_pmd(zero_page, vma->vm_page_prot);
774 entry = pmd_wrprotect(entry); 772 entry = pmd_wrprotect(entry);
775 entry = pmd_mkhuge(entry); 773 entry = pmd_mkhuge(entry);
776 set_pmd_at(mm, haddr, pmd, entry); 774 set_pmd_at(mm, haddr, pmd, entry);
@@ -795,20 +793,20 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
795 if (!(flags & FAULT_FLAG_WRITE) && 793 if (!(flags & FAULT_FLAG_WRITE) &&
796 transparent_hugepage_use_zero_page()) { 794 transparent_hugepage_use_zero_page()) {
797 pgtable_t pgtable; 795 pgtable_t pgtable;
798 unsigned long zero_pfn; 796 struct page *zero_page;
799 bool set; 797 bool set;
800 pgtable = pte_alloc_one(mm, haddr); 798 pgtable = pte_alloc_one(mm, haddr);
801 if (unlikely(!pgtable)) 799 if (unlikely(!pgtable))
802 return VM_FAULT_OOM; 800 return VM_FAULT_OOM;
803 zero_pfn = get_huge_zero_page(); 801 zero_page = get_huge_zero_page();
804 if (unlikely(!zero_pfn)) { 802 if (unlikely(!zero_page)) {
805 pte_free(mm, pgtable); 803 pte_free(mm, pgtable);
806 count_vm_event(THP_FAULT_FALLBACK); 804 count_vm_event(THP_FAULT_FALLBACK);
807 goto out; 805 goto out;
808 } 806 }
809 spin_lock(&mm->page_table_lock); 807 spin_lock(&mm->page_table_lock);
810 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 808 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
811 zero_pfn); 809 zero_page);
812 spin_unlock(&mm->page_table_lock); 810 spin_unlock(&mm->page_table_lock);
813 if (!set) { 811 if (!set) {
814 pte_free(mm, pgtable); 812 pte_free(mm, pgtable);
@@ -887,16 +885,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
887 * a page table. 885 * a page table.
888 */ 886 */
889 if (is_huge_zero_pmd(pmd)) { 887 if (is_huge_zero_pmd(pmd)) {
890 unsigned long zero_pfn; 888 struct page *zero_page;
891 bool set; 889 bool set;
892 /* 890 /*
893 * get_huge_zero_page() will never allocate a new page here, 891 * get_huge_zero_page() will never allocate a new page here,
894 * since we already have a zero page to copy. It just takes a 892 * since we already have a zero page to copy. It just takes a
895 * reference. 893 * reference.
896 */ 894 */
897 zero_pfn = get_huge_zero_page(); 895 zero_page = get_huge_zero_page();
898 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 896 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
899 zero_pfn); 897 zero_page);
900 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ 898 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
901 ret = 0; 899 ret = 0;
902 goto out_unlock; 900 goto out_unlock;
@@ -1560,7 +1558,8 @@ static int __split_huge_page_splitting(struct page *page,
1560 return ret; 1558 return ret;
1561} 1559}
1562 1560
1563static void __split_huge_page_refcount(struct page *page) 1561static void __split_huge_page_refcount(struct page *page,
1562 struct list_head *list)
1564{ 1563{
1565 int i; 1564 int i;
1566 struct zone *zone = page_zone(page); 1565 struct zone *zone = page_zone(page);
@@ -1646,7 +1645,7 @@ static void __split_huge_page_refcount(struct page *page)
1646 BUG_ON(!PageDirty(page_tail)); 1645 BUG_ON(!PageDirty(page_tail));
1647 BUG_ON(!PageSwapBacked(page_tail)); 1646 BUG_ON(!PageSwapBacked(page_tail));
1648 1647
1649 lru_add_page_tail(page, page_tail, lruvec); 1648 lru_add_page_tail(page, page_tail, lruvec, list);
1650 } 1649 }
1651 atomic_sub(tail_count, &page->_count); 1650 atomic_sub(tail_count, &page->_count);
1652 BUG_ON(atomic_read(&page->_count) <= 0); 1651 BUG_ON(atomic_read(&page->_count) <= 0);
@@ -1753,7 +1752,8 @@ static int __split_huge_page_map(struct page *page,
1753 1752
1754/* must be called with anon_vma->root->rwsem held */ 1753/* must be called with anon_vma->root->rwsem held */
1755static void __split_huge_page(struct page *page, 1754static void __split_huge_page(struct page *page,
1756 struct anon_vma *anon_vma) 1755 struct anon_vma *anon_vma,
1756 struct list_head *list)
1757{ 1757{
1758 int mapcount, mapcount2; 1758 int mapcount, mapcount2;
1759 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1759 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1784,7 +1784,7 @@ static void __split_huge_page(struct page *page,
1784 mapcount, page_mapcount(page)); 1784 mapcount, page_mapcount(page));
1785 BUG_ON(mapcount != page_mapcount(page)); 1785 BUG_ON(mapcount != page_mapcount(page));
1786 1786
1787 __split_huge_page_refcount(page); 1787 __split_huge_page_refcount(page, list);
1788 1788
1789 mapcount2 = 0; 1789 mapcount2 = 0;
1790 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1790 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
@@ -1799,12 +1799,19 @@ static void __split_huge_page(struct page *page,
1799 BUG_ON(mapcount != mapcount2); 1799 BUG_ON(mapcount != mapcount2);
1800} 1800}
1801 1801
1802int split_huge_page(struct page *page) 1802/*
1803 * Split a hugepage into normal pages. This doesn't change the position of head
1804 * page. If @list is null, tail pages will be added to LRU list, otherwise, to
1805 * @list. Both head page and tail pages will inherit mapping, flags, and so on
1806 * from the hugepage.
1807 * Return 0 if the hugepage is split successfully otherwise return 1.
1808 */
1809int split_huge_page_to_list(struct page *page, struct list_head *list)
1803{ 1810{
1804 struct anon_vma *anon_vma; 1811 struct anon_vma *anon_vma;
1805 int ret = 1; 1812 int ret = 1;
1806 1813
1807 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1814 BUG_ON(is_huge_zero_page(page));
1808 BUG_ON(!PageAnon(page)); 1815 BUG_ON(!PageAnon(page));
1809 1816
1810 /* 1817 /*
@@ -1824,7 +1831,7 @@ int split_huge_page(struct page *page)
1824 goto out_unlock; 1831 goto out_unlock;
1825 1832
1826 BUG_ON(!PageSwapBacked(page)); 1833 BUG_ON(!PageSwapBacked(page));
1827 __split_huge_page(page, anon_vma); 1834 __split_huge_page(page, anon_vma, list);
1828 count_vm_event(THP_SPLIT); 1835 count_vm_event(THP_SPLIT);
1829 1836
1830 BUG_ON(PageCompound(page)); 1837 BUG_ON(PageCompound(page));
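
The new split_huge_page_to_list() above takes an explicit list for the tail pages; existing callers presumably keep working through a thin header-side wrapper (not part of this hunk), along these lines:

/*
 * Expected compatibility wrapper (assumption -- the include/linux change
 * is not shown here): callers that don't care where the tail pages go
 * keep using split_huge_page(), which passes a NULL list so the tails
 * land on the LRU as before.
 */
static inline int split_huge_page(struct page *page)
{
	return split_huge_page_to_list(page, NULL);
}
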
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a0be33bb199..f8feeeca6686 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1761,7 +1761,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1761 * Unregister hstate attributes from a single node device. 1761 * Unregister hstate attributes from a single node device.
1762 * No-op if no hstate attributes attached. 1762 * No-op if no hstate attributes attached.
1763 */ 1763 */
1764void hugetlb_unregister_node(struct node *node) 1764static void hugetlb_unregister_node(struct node *node)
1765{ 1765{
1766 struct hstate *h; 1766 struct hstate *h;
1767 struct node_hstate *nhs = &node_hstates[node->dev.id]; 1767 struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -1805,7 +1805,7 @@ static void hugetlb_unregister_all_nodes(void)
1805 * Register hstate attributes for a single node device. 1805 * Register hstate attributes for a single node device.
1806 * No-op if attributes already registered. 1806 * No-op if attributes already registered.
1807 */ 1807 */
1808void hugetlb_register_node(struct node *node) 1808static void hugetlb_register_node(struct node *node)
1809{ 1809{
1810 struct hstate *h; 1810 struct hstate *h;
1811 struct node_hstate *nhs = &node_hstates[node->dev.id]; 1811 struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -2121,11 +2121,30 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
2121 nid, h->surplus_huge_pages_node[nid]); 2121 nid, h->surplus_huge_pages_node[nid]);
2122} 2122}
2123 2123
2124void hugetlb_show_meminfo(void)
2125{
2126 struct hstate *h;
2127 int nid;
2128
2129 for_each_node_state(nid, N_MEMORY)
2130 for_each_hstate(h)
2131 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2132 nid,
2133 h->nr_huge_pages_node[nid],
2134 h->free_huge_pages_node[nid],
2135 h->surplus_huge_pages_node[nid],
2136 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2137}
2138
2124/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 2139/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
2125unsigned long hugetlb_total_pages(void) 2140unsigned long hugetlb_total_pages(void)
2126{ 2141{
2127 struct hstate *h = &default_hstate; 2142 struct hstate *h;
2128 return h->nr_huge_pages * pages_per_huge_page(h); 2143 unsigned long nr_total_pages = 0;
2144
2145 for_each_hstate(h)
2146 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2147 return nr_total_pages;
2129} 2148}
2130 2149
2131static int hugetlb_acct_memory(struct hstate *h, long delta) 2150static int hugetlb_acct_memory(struct hstate *h, long delta)
@@ -2243,10 +2262,11 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2243 pte_t entry; 2262 pte_t entry;
2244 2263
2245 if (writable) { 2264 if (writable) {
2246 entry = 2265 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
2247 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 2266 vma->vm_page_prot)));
2248 } else { 2267 } else {
2249 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 2268 entry = huge_pte_wrprotect(mk_huge_pte(page,
2269 vma->vm_page_prot));
2250 } 2270 }
2251 entry = pte_mkyoung(entry); 2271 entry = pte_mkyoung(entry);
2252 entry = pte_mkhuge(entry); 2272 entry = pte_mkhuge(entry);
@@ -2260,7 +2280,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2260{ 2280{
2261 pte_t entry; 2281 pte_t entry;
2262 2282
2263 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2283 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
2264 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 2284 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2265 update_mmu_cache(vma, address, ptep); 2285 update_mmu_cache(vma, address, ptep);
2266} 2286}
@@ -2375,7 +2395,7 @@ again:
2375 * HWPoisoned hugepage is already unmapped and dropped reference 2395 * HWPoisoned hugepage is already unmapped and dropped reference
2376 */ 2396 */
2377 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 2397 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2378 pte_clear(mm, address, ptep); 2398 huge_pte_clear(mm, address, ptep);
2379 continue; 2399 continue;
2380 } 2400 }
2381 2401
@@ -2399,7 +2419,7 @@ again:
2399 2419
2400 pte = huge_ptep_get_and_clear(mm, address, ptep); 2420 pte = huge_ptep_get_and_clear(mm, address, ptep);
2401 tlb_remove_tlb_entry(tlb, ptep, address); 2421 tlb_remove_tlb_entry(tlb, ptep, address);
2402 if (pte_dirty(pte)) 2422 if (huge_pte_dirty(pte))
2403 set_page_dirty(page); 2423 set_page_dirty(page);
2404 2424
2405 page_remove_rmap(page); 2425 page_remove_rmap(page);
@@ -2852,7 +2872,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2852 * page now as it is used to determine if a reservation has been 2872 * page now as it is used to determine if a reservation has been
2853 * consumed. 2873 * consumed.
2854 */ 2874 */
2855 if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { 2875 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
2856 if (vma_needs_reservation(h, vma, address) < 0) { 2876 if (vma_needs_reservation(h, vma, address) < 0) {
2857 ret = VM_FAULT_OOM; 2877 ret = VM_FAULT_OOM;
2858 goto out_mutex; 2878 goto out_mutex;
@@ -2882,12 +2902,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2882 2902
2883 2903
2884 if (flags & FAULT_FLAG_WRITE) { 2904 if (flags & FAULT_FLAG_WRITE) {
2885 if (!pte_write(entry)) { 2905 if (!huge_pte_write(entry)) {
2886 ret = hugetlb_cow(mm, vma, address, ptep, entry, 2906 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2887 pagecache_page); 2907 pagecache_page);
2888 goto out_page_table_lock; 2908 goto out_page_table_lock;
2889 } 2909 }
2890 entry = pte_mkdirty(entry); 2910 entry = huge_pte_mkdirty(entry);
2891 } 2911 }
2892 entry = pte_mkyoung(entry); 2912 entry = pte_mkyoung(entry);
2893 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2913 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
@@ -2957,8 +2977,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2957 break; 2977 break;
2958 } 2978 }
2959 2979
2960 if (absent || 2980 /*
2961 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { 2981 * We need call hugetlb_fault for both hugepages under migration
2982 * (in which case hugetlb_fault waits for the migration,) and
2983 * hwpoisoned hugepages (in which case we need to prevent the
2984 * caller from accessing to them.) In order to do this, we use
2985 * here is_swap_pte instead of is_hugetlb_entry_migration and
2986 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
2987 * both cases, and because we can't follow correct pages
2988 * directly from any kind of swap entries.
2989 */
2990 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
2991 ((flags & FOLL_WRITE) &&
2992 !huge_pte_write(huge_ptep_get(pte)))) {
2962 int ret; 2993 int ret;
2963 2994
2964 spin_unlock(&mm->page_table_lock); 2995 spin_unlock(&mm->page_table_lock);
@@ -3028,7 +3059,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3028 } 3059 }
3029 if (!huge_pte_none(huge_ptep_get(ptep))) { 3060 if (!huge_pte_none(huge_ptep_get(ptep))) {
3030 pte = huge_ptep_get_and_clear(mm, address, ptep); 3061 pte = huge_ptep_get_and_clear(mm, address, ptep);
3031 pte = pte_mkhuge(pte_modify(pte, newprot)); 3062 pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3032 pte = arch_make_huge_pte(pte, vma, NULL, 0); 3063 pte = arch_make_huge_pte(pte, vma, NULL, 0);
3033 set_huge_pte_at(mm, address, ptep, pte); 3064 set_huge_pte_at(mm, address, ptep, pte);
3034 pages++; 3065 pages++;
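
The hugetlb.c hunks above switch from the bare pte_* helpers to huge_pte_* ones. On architectures whose huge PTEs look like ordinary PTEs these are presumably just pass-through fallbacks defined in a generic header outside this diff, roughly:

/*
 * Sketch of the kind of generic fallbacks the huge_pte_* calls above
 * rely on (assumption: the real definitions live in an asm-generic or
 * per-arch header not included in this hunk).  Architectures with an
 * unusual huge-PTE format provide their own versions instead.
 */
static inline pte_t huge_pte_mkwrite(pte_t pte)
{
	return pte_mkwrite(pte);
}

static inline pte_t huge_pte_mkdirty(pte_t pte)
{
	return pte_mkdirty(pte);
}

static inline int huge_pte_dirty(pte_t pte)
{
	return pte_dirty(pte);
}

static inline void huge_pte_clear(struct mm_struct *mm,
				  unsigned long addr, pte_t *ptep)
{
	pte_clear(mm, addr, ptep);
}
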
diff --git a/mm/madvise.c b/mm/madvise.c
index c58c94b56c3d..7055883e6e25 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -473,27 +473,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
473 if (!madvise_behavior_valid(behavior)) 473 if (!madvise_behavior_valid(behavior))
474 return error; 474 return error;
475 475
476 write = madvise_need_mmap_write(behavior);
477 if (write)
478 down_write(&current->mm->mmap_sem);
479 else
480 down_read(&current->mm->mmap_sem);
481
482 if (start & ~PAGE_MASK) 476 if (start & ~PAGE_MASK)
483 goto out; 477 return error;
484 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 478 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
485 479
486 /* Check to see whether len was rounded up from small -ve to zero */ 480 /* Check to see whether len was rounded up from small -ve to zero */
487 if (len_in && !len) 481 if (len_in && !len)
488 goto out; 482 return error;
489 483
490 end = start + len; 484 end = start + len;
491 if (end < start) 485 if (end < start)
492 goto out; 486 return error;
493 487
494 error = 0; 488 error = 0;
495 if (end == start) 489 if (end == start)
496 goto out; 490 return error;
491
492 write = madvise_need_mmap_write(behavior);
493 if (write)
494 down_write(&current->mm->mmap_sem);
495 else
496 down_read(&current->mm->mmap_sem);
497 497
498 /* 498 /*
499 * If the interval [start,end) covers some unmapped address 499 * If the interval [start,end) covers some unmapped address
@@ -509,14 +509,14 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
509 /* Still start < end. */ 509 /* Still start < end. */
510 error = -ENOMEM; 510 error = -ENOMEM;
511 if (!vma) 511 if (!vma)
512 goto out_plug; 512 goto out;
513 513
514 /* Here start < (end|vma->vm_end). */ 514 /* Here start < (end|vma->vm_end). */
515 if (start < vma->vm_start) { 515 if (start < vma->vm_start) {
516 unmapped_error = -ENOMEM; 516 unmapped_error = -ENOMEM;
517 start = vma->vm_start; 517 start = vma->vm_start;
518 if (start >= end) 518 if (start >= end)
519 goto out_plug; 519 goto out;
520 } 520 }
521 521
522 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 522 /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -527,21 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
528 error = madvise_vma(vma, &prev, start, tmp, behavior); 528 error = madvise_vma(vma, &prev, start, tmp, behavior);
529 if (error) 529 if (error)
530 goto out_plug; 530 goto out;
531 start = tmp; 531 start = tmp;
532 if (prev && start < prev->vm_end) 532 if (prev && start < prev->vm_end)
533 start = prev->vm_end; 533 start = prev->vm_end;
534 error = unmapped_error; 534 error = unmapped_error;
535 if (start >= end) 535 if (start >= end)
536 goto out_plug; 536 goto out;
537 if (prev) 537 if (prev)
538 vma = prev->vm_next; 538 vma = prev->vm_next;
539 else /* madvise_remove dropped mmap_sem */ 539 else /* madvise_remove dropped mmap_sem */
540 vma = find_vma(current->mm, start); 540 vma = find_vma(current->mm, start);
541 } 541 }
542out_plug:
543 blk_finish_plug(&plug);
544out: 542out:
543 blk_finish_plug(&plug);
545 if (write) 544 if (write)
546 up_write(&current->mm->mmap_sem); 545 up_write(&current->mm->mmap_sem);
547 else 546 else
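
With the madvise() reordering above, the start/length sanity checks run before mmap_sem is taken and before the block plug is started; the userspace-visible results are unchanged. A small test sketch:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);

	/* Unaligned start: rejected up front with EINVAL, no lock taken. */
	if (madvise((void *)1, psz, MADV_DONTNEED) == -1)
		printf("unaligned start: %s\n", strerror(errno));

	/* Zero length: end == start, so the call succeeds without doing anything. */
	printf("zero length: %d\n", madvise(NULL, 0, MADV_DONTNEED));
	return 0;
}
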
diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147e5c08..c5fad932fa51 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -322,10 +322,11 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
322 322
323/** 323/**
324 * memblock_insert_region - insert new memblock region 324 * memblock_insert_region - insert new memblock region
325 * @type: memblock type to insert into 325 * @type: memblock type to insert into
326 * @idx: index for the insertion point 326 * @idx: index for the insertion point
327 * @base: base address of the new region 327 * @base: base address of the new region
328 * @size: size of the new region 328 * @size: size of the new region
329 * @nid: node id of the new region
329 * 330 *
330 * Insert new memblock region [@base,@base+@size) into @type at @idx. 331 * Insert new memblock region [@base,@base+@size) into @type at @idx.
331 * @type must already have extra room to accomodate the new region. 332 * @type must already have extra room to accomodate the new region.
@@ -771,6 +772,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
771{ 772{
772 phys_addr_t found; 773 phys_addr_t found;
773 774
775 if (WARN_ON(!align))
776 align = __alignof__(long long);
777
774 /* align @size to avoid excessive fragmentation on reserved array */ 778 /* align @size to avoid excessive fragmentation on reserved array */
775 size = round_up(size, align); 779 size = round_up(size, align);
776 780
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b552224f5cf..0f1d92163f30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
49#include <linux/fs.h> 49#include <linux/fs.h>
50#include <linux/seq_file.h> 50#include <linux/seq_file.h>
51#include <linux/vmalloc.h> 51#include <linux/vmalloc.h>
52#include <linux/vmpressure.h>
52#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
53#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
54#include <linux/cpu.h> 55#include <linux/cpu.h>
@@ -152,8 +153,13 @@ struct mem_cgroup_stat_cpu {
152}; 153};
153 154
154struct mem_cgroup_reclaim_iter { 155struct mem_cgroup_reclaim_iter {
155 /* css_id of the last scanned hierarchy member */ 156 /*
156 int position; 157 * last scanned hierarchy member. Valid only if last_dead_count
158 * matches memcg->dead_count of the hierarchy root group.
159 */
160 struct mem_cgroup *last_visited;
161 unsigned long last_dead_count;
162
157 /* scan generation, increased every round-trip */ 163 /* scan generation, increased every round-trip */
158 unsigned int generation; 164 unsigned int generation;
159}; 165};
@@ -256,6 +262,9 @@ struct mem_cgroup {
256 */ 262 */
257 struct res_counter res; 263 struct res_counter res;
258 264
265 /* vmpressure notifications */
266 struct vmpressure vmpressure;
267
259 union { 268 union {
260 /* 269 /*
261 * the counter to account for mem+swap usage. 270 * the counter to account for mem+swap usage.
@@ -335,6 +344,7 @@ struct mem_cgroup {
335 struct mem_cgroup_stat_cpu nocpu_base; 344 struct mem_cgroup_stat_cpu nocpu_base;
336 spinlock_t pcp_counter_lock; 345 spinlock_t pcp_counter_lock;
337 346
347 atomic_t dead_count;
338#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 348#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
339 struct tcp_memcontrol tcp_mem; 349 struct tcp_memcontrol tcp_mem;
340#endif 350#endif
@@ -353,6 +363,7 @@ struct mem_cgroup {
353 atomic_t numainfo_events; 363 atomic_t numainfo_events;
354 atomic_t numainfo_updating; 364 atomic_t numainfo_updating;
355#endif 365#endif
366
356 /* 367 /*
357 * Per cgroup active and inactive list, similar to the 368 * Per cgroup active and inactive list, similar to the
358 * per zone LRU lists. 369 * per zone LRU lists.
@@ -504,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
504 return container_of(s, struct mem_cgroup, css); 515 return container_of(s, struct mem_cgroup, css);
505} 516}
506 517
518/* Some nice accessors for the vmpressure. */
519struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
520{
521 if (!memcg)
522 memcg = root_mem_cgroup;
523 return &memcg->vmpressure;
524}
525
526struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
527{
528 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
529}
530
531struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
532{
533 return &mem_cgroup_from_css(css)->vmpressure;
534}
535
507static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 536static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
508{ 537{
509 return (memcg == root_mem_cgroup); 538 return (memcg == root_mem_cgroup);
@@ -1067,6 +1096,51 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1067 return memcg; 1096 return memcg;
1068} 1097}
1069 1098
1099/*
1100 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1101 * ref. count) or NULL if the whole root's subtree has been visited.
1102 *
1103 * helper function to be used by mem_cgroup_iter
1104 */
1105static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1106 struct mem_cgroup *last_visited)
1107{
1108 struct cgroup *prev_cgroup, *next_cgroup;
1109
1110 /*
1111 * Root is not visited by cgroup iterators so it needs an
1112 * explicit visit.
1113 */
1114 if (!last_visited)
1115 return root;
1116
1117 prev_cgroup = (last_visited == root) ? NULL
1118 : last_visited->css.cgroup;
1119skip_node:
1120 next_cgroup = cgroup_next_descendant_pre(
1121 prev_cgroup, root->css.cgroup);
1122
1123 /*
1124 * Even if we found a group we have to make sure it is
1125 * alive. css && !memcg means that the groups should be
1126 * skipped and we should continue the tree walk.
1127 * last_visited css is safe to use because it is
1128 * protected by css_get and the tree walk is rcu safe.
1129 */
1130 if (next_cgroup) {
1131 struct mem_cgroup *mem = mem_cgroup_from_cont(
1132 next_cgroup);
1133 if (css_tryget(&mem->css))
1134 return mem;
1135 else {
1136 prev_cgroup = next_cgroup;
1137 goto skip_node;
1138 }
1139 }
1140
1141 return NULL;
1142}
1143
1070/** 1144/**
1071 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1145 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1072 * @root: hierarchy root 1146 * @root: hierarchy root
@@ -1089,7 +1163,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1089 struct mem_cgroup_reclaim_cookie *reclaim) 1163 struct mem_cgroup_reclaim_cookie *reclaim)
1090{ 1164{
1091 struct mem_cgroup *memcg = NULL; 1165 struct mem_cgroup *memcg = NULL;
1092 int id = 0; 1166 struct mem_cgroup *last_visited = NULL;
1167 unsigned long uninitialized_var(dead_count);
1093 1168
1094 if (mem_cgroup_disabled()) 1169 if (mem_cgroup_disabled())
1095 return NULL; 1170 return NULL;
@@ -1098,20 +1173,17 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1098 root = root_mem_cgroup; 1173 root = root_mem_cgroup;
1099 1174
1100 if (prev && !reclaim) 1175 if (prev && !reclaim)
1101 id = css_id(&prev->css); 1176 last_visited = prev;
1102
1103 if (prev && prev != root)
1104 css_put(&prev->css);
1105 1177
1106 if (!root->use_hierarchy && root != root_mem_cgroup) { 1178 if (!root->use_hierarchy && root != root_mem_cgroup) {
1107 if (prev) 1179 if (prev)
1108 return NULL; 1180 goto out_css_put;
1109 return root; 1181 return root;
1110 } 1182 }
1111 1183
1184 rcu_read_lock();
1112 while (!memcg) { 1185 while (!memcg) {
1113 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1186 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1114 struct cgroup_subsys_state *css;
1115 1187
1116 if (reclaim) { 1188 if (reclaim) {
1117 int nid = zone_to_nid(reclaim->zone); 1189 int nid = zone_to_nid(reclaim->zone);
@@ -1120,31 +1192,60 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1120 1192
1121 mz = mem_cgroup_zoneinfo(root, nid, zid); 1193 mz = mem_cgroup_zoneinfo(root, nid, zid);
1122 iter = &mz->reclaim_iter[reclaim->priority]; 1194 iter = &mz->reclaim_iter[reclaim->priority];
1123 if (prev && reclaim->generation != iter->generation) 1195 last_visited = iter->last_visited;
1124 return NULL; 1196 if (prev && reclaim->generation != iter->generation) {
1125 id = iter->position; 1197 iter->last_visited = NULL;
1198 goto out_unlock;
1199 }
1200
1201 /*
1202 * If the dead_count mismatches, a destruction
1203 * has happened or is happening concurrently.
1204 * If the dead_count matches, a destruction
1205 * might still happen concurrently, but since
1206 * we checked under RCU, that destruction
1207 * won't free the object until we release the
1208 * RCU reader lock. Thus, the dead_count
1209 * check verifies the pointer is still valid,
1210 * css_tryget() verifies the cgroup pointed to
1211 * is alive.
1212 */
1213 dead_count = atomic_read(&root->dead_count);
1214 smp_rmb();
1215 last_visited = iter->last_visited;
1216 if (last_visited) {
1217 if ((dead_count != iter->last_dead_count) ||
1218 !css_tryget(&last_visited->css)) {
1219 last_visited = NULL;
1220 }
1221 }
1126 } 1222 }
1127 1223
1128 rcu_read_lock(); 1224 memcg = __mem_cgroup_iter_next(root, last_visited);
1129 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
1130 if (css) {
1131 if (css == &root->css || css_tryget(css))
1132 memcg = mem_cgroup_from_css(css);
1133 } else
1134 id = 0;
1135 rcu_read_unlock();
1136 1225
1137 if (reclaim) { 1226 if (reclaim) {
1138 iter->position = id; 1227 if (last_visited)
1139 if (!css) 1228 css_put(&last_visited->css);
1229
1230 iter->last_visited = memcg;
1231 smp_wmb();
1232 iter->last_dead_count = dead_count;
1233
1234 if (!memcg)
1140 iter->generation++; 1235 iter->generation++;
1141 else if (!prev && memcg) 1236 else if (!prev && memcg)
1142 reclaim->generation = iter->generation; 1237 reclaim->generation = iter->generation;
1143 } 1238 }
1144 1239
1145 if (prev && !css) 1240 if (prev && !memcg)
1146 return NULL; 1241 goto out_unlock;
1147 } 1242 }
1243out_unlock:
1244 rcu_read_unlock();
1245out_css_put:
1246 if (prev && prev != root)
1247 css_put(&prev->css);
1248
1148 return memcg; 1249 return memcg;
1149} 1250}
1150 1251
@@ -1686,11 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1686 struct task_struct *chosen = NULL; 1787 struct task_struct *chosen = NULL;
1687 1788
1688 /* 1789 /*
1689 * If current has a pending SIGKILL, then automatically select it. The 1790 * If current has a pending SIGKILL or is exiting, then automatically
1690 * goal is to allow it to allocate so that it may quickly exit and free 1791 * select it. The goal is to allow it to allocate so that it may
1691 * its memory. 1792 * quickly exit and free its memory.
1692 */ 1793 */
1693 if (fatal_signal_pending(current)) { 1794 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1694 set_thread_flag(TIF_MEMDIE); 1795 set_thread_flag(TIF_MEMDIE);
1695 return; 1796 return;
1696 } 1797 }
@@ -3114,12 +3215,12 @@ void memcg_release_cache(struct kmem_cache *s)
3114 3215
3115 root = s->memcg_params->root_cache; 3216 root = s->memcg_params->root_cache;
3116 root->memcg_params->memcg_caches[id] = NULL; 3217 root->memcg_params->memcg_caches[id] = NULL;
3117 mem_cgroup_put(memcg);
3118 3218
3119 mutex_lock(&memcg->slab_caches_mutex); 3219 mutex_lock(&memcg->slab_caches_mutex);
3120 list_del(&s->memcg_params->list); 3220 list_del(&s->memcg_params->list);
3121 mutex_unlock(&memcg->slab_caches_mutex); 3221 mutex_unlock(&memcg->slab_caches_mutex);
3122 3222
3223 mem_cgroup_put(memcg);
3123out: 3224out:
3124 kfree(s->memcg_params); 3225 kfree(s->memcg_params);
3125} 3226}
@@ -3220,52 +3321,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3220 schedule_work(&cachep->memcg_params->destroy); 3321 schedule_work(&cachep->memcg_params->destroy);
3221} 3322}
3222 3323
3223static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) 3324/*
3224{ 3325 * This lock protects updaters, not readers. We want readers to be as fast as
3225 char *name; 3326 * they can, and they will either see NULL or a valid cache value. Our model
3226 struct dentry *dentry; 3327 * allow them to see NULL, in which case the root memcg will be selected.
3227 3328 *
3228 rcu_read_lock(); 3329 * We need this lock because multiple allocations to the same cache from a non
3229 dentry = rcu_dereference(memcg->css.cgroup->dentry); 3330 * will span more than one worker. Only one of them can create the cache.
3230 rcu_read_unlock(); 3331 */
3231 3332static DEFINE_MUTEX(memcg_cache_mutex);
3232 BUG_ON(dentry == NULL);
3233
3234 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3235 memcg_cache_id(memcg), dentry->d_name.name);
3236
3237 return name;
3238}
3239 3333
3334/*
3335 * Called with memcg_cache_mutex held
3336 */
3240static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, 3337static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3241 struct kmem_cache *s) 3338 struct kmem_cache *s)
3242{ 3339{
3243 char *name;
3244 struct kmem_cache *new; 3340 struct kmem_cache *new;
3341 static char *tmp_name = NULL;
3245 3342
3246 name = memcg_cache_name(memcg, s); 3343 lockdep_assert_held(&memcg_cache_mutex);
3247 if (!name) 3344
3248 return NULL; 3345 /*
3346 * kmem_cache_create_memcg duplicates the given name and
3347 * cgroup_name for this name requires RCU context.
3348 * This static temporary buffer is used to prevent from
3349 * pointless shortliving allocation.
3350 */
3351 if (!tmp_name) {
3352 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3353 if (!tmp_name)
3354 return NULL;
3355 }
3356
3357 rcu_read_lock();
3358 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3359 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3360 rcu_read_unlock();
3249 3361
3250 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, 3362 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3251 (s->flags & ~SLAB_PANIC), s->ctor, s); 3363 (s->flags & ~SLAB_PANIC), s->ctor, s);
3252 3364
3253 if (new) 3365 if (new)
3254 new->allocflags |= __GFP_KMEMCG; 3366 new->allocflags |= __GFP_KMEMCG;
3255 3367
3256 kfree(name);
3257 return new; 3368 return new;
3258} 3369}
3259 3370
3260/*
3261 * This lock protects updaters, not readers. We want readers to be as fast as
3262 * they can, and they will either see NULL or a valid cache value. Our model
3263 * allow them to see NULL, in which case the root memcg will be selected.
3264 *
3265 * We need this lock because multiple allocations to the same cache from a non
3266 * will span more than one worker. Only one of them can create the cache.
3267 */
3268static DEFINE_MUTEX(memcg_cache_mutex);
3269static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3371static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3270 struct kmem_cache *cachep) 3372 struct kmem_cache *cachep)
3271{ 3373{
@@ -3382,7 +3484,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3382 3484
3383/* 3485/*
3384 * Enqueue the creation of a per-memcg kmem_cache. 3486 * Enqueue the creation of a per-memcg kmem_cache.
3385 * Called with rcu_read_lock.
3386 */ 3487 */
3387static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3488static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3388 struct kmem_cache *cachep) 3489 struct kmem_cache *cachep)
@@ -3390,12 +3491,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3390 struct create_work *cw; 3491 struct create_work *cw;
3391 3492
3392 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3493 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3393 if (cw == NULL) 3494 if (cw == NULL) {
3394 return; 3495 css_put(&memcg->css);
3395
3396 /* The corresponding put will be done in the workqueue. */
3397 if (!css_tryget(&memcg->css)) {
3398 kfree(cw);
3399 return; 3496 return;
3400 } 3497 }
3401 3498
@@ -3451,10 +3548,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3451 3548
3452 rcu_read_lock(); 3549 rcu_read_lock();
3453 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3550 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3454 rcu_read_unlock();
3455 3551
3456 if (!memcg_can_account_kmem(memcg)) 3552 if (!memcg_can_account_kmem(memcg))
3457 return cachep; 3553 goto out;
3458 3554
3459 idx = memcg_cache_id(memcg); 3555 idx = memcg_cache_id(memcg);
3460 3556
@@ -3463,29 +3559,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3463 * code updating memcg_caches will issue a write barrier to match this. 3559 * code updating memcg_caches will issue a write barrier to match this.
3464 */ 3560 */
3465 read_barrier_depends(); 3561 read_barrier_depends();
3466 if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { 3562 if (likely(cachep->memcg_params->memcg_caches[idx])) {
3467 /* 3563 cachep = cachep->memcg_params->memcg_caches[idx];
3468 * If we are in a safe context (can wait, and not in interrupt 3564 goto out;
3469 * context), we could be be predictable and return right away.
3470 * This would guarantee that the allocation being performed
3471 * already belongs in the new cache.
3472 *
3473 * However, there are some clashes that can arrive from locking.
3474 * For instance, because we acquire the slab_mutex while doing
3475 * kmem_cache_dup, this means no further allocation could happen
3476 * with the slab_mutex held.
3477 *
3478 * Also, because cache creation issue get_online_cpus(), this
3479 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3480 * that ends up reversed during cpu hotplug. (cpuset allocates
3481 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3482 * better to defer everything.
3483 */
3484 memcg_create_cache_enqueue(memcg, cachep);
3485 return cachep;
3486 } 3565 }
3487 3566
3488 return cachep->memcg_params->memcg_caches[idx]; 3567 /* The corresponding put will be done in the workqueue. */
3568 if (!css_tryget(&memcg->css))
3569 goto out;
3570 rcu_read_unlock();
3571
3572 /*
3573 * If we are in a safe context (can wait, and not in interrupt
3574 * context), we could be be predictable and return right away.
3575 * This would guarantee that the allocation being performed
3576 * already belongs in the new cache.
3577 *
3578 * However, there are some clashes that can arrive from locking.
3579 * For instance, because we acquire the slab_mutex while doing
3580 * kmem_cache_dup, this means no further allocation could happen
3581 * with the slab_mutex held.
3582 *
3583 * Also, because cache creation issue get_online_cpus(), this
3584 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3585 * that ends up reversed during cpu hotplug. (cpuset allocates
3586 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3587 * better to defer everything.
3588 */
3589 memcg_create_cache_enqueue(memcg, cachep);
3590 return cachep;
3591out:
3592 rcu_read_unlock();
3593 return cachep;
3489} 3594}
3490EXPORT_SYMBOL(__memcg_kmem_get_cache); 3595EXPORT_SYMBOL(__memcg_kmem_get_cache);
3491 3596
@@ -4947,9 +5052,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
4947 type = MEMFILE_TYPE(cft->private); 5052 type = MEMFILE_TYPE(cft->private);
4948 name = MEMFILE_ATTR(cft->private); 5053 name = MEMFILE_ATTR(cft->private);
4949 5054
4950 if (!do_swap_account && type == _MEMSWAP)
4951 return -EOPNOTSUPP;
4952
4953 switch (type) { 5055 switch (type) {
4954 case _MEM: 5056 case _MEM:
4955 if (name == RES_USAGE) 5057 if (name == RES_USAGE)
@@ -5084,9 +5186,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5084 type = MEMFILE_TYPE(cft->private); 5186 type = MEMFILE_TYPE(cft->private);
5085 name = MEMFILE_ATTR(cft->private); 5187 name = MEMFILE_ATTR(cft->private);
5086 5188
5087 if (!do_swap_account && type == _MEMSWAP)
5088 return -EOPNOTSUPP;
5089
5090 switch (name) { 5189 switch (name) {
5091 case RES_LIMIT: 5190 case RES_LIMIT:
5092 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5191 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -5163,9 +5262,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
5163 type = MEMFILE_TYPE(event); 5262 type = MEMFILE_TYPE(event);
5164 name = MEMFILE_ATTR(event); 5263 name = MEMFILE_ATTR(event);
5165 5264
5166 if (!do_swap_account && type == _MEMSWAP)
5167 return -EOPNOTSUPP;
5168
5169 switch (name) { 5265 switch (name) {
5170 case RES_MAX_USAGE: 5266 case RES_MAX_USAGE:
5171 if (type == _MEM) 5267 if (type == _MEM)
@@ -5744,7 +5840,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5744 return ret; 5840 return ret;
5745 5841
5746 return mem_cgroup_sockets_init(memcg, ss); 5842 return mem_cgroup_sockets_init(memcg, ss);
5747}; 5843}
5748 5844
5749static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5845static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
5750{ 5846{
@@ -5817,6 +5913,7 @@ static struct cftype mem_cgroup_files[] = {
5817 }, 5913 },
5818 { 5914 {
5819 .name = "use_hierarchy", 5915 .name = "use_hierarchy",
5916 .flags = CFTYPE_INSANE,
5820 .write_u64 = mem_cgroup_hierarchy_write, 5917 .write_u64 = mem_cgroup_hierarchy_write,
5821 .read_u64 = mem_cgroup_hierarchy_read, 5918 .read_u64 = mem_cgroup_hierarchy_read,
5822 }, 5919 },
@@ -5838,6 +5935,11 @@ static struct cftype mem_cgroup_files[] = {
5838 .unregister_event = mem_cgroup_oom_unregister_event, 5935 .unregister_event = mem_cgroup_oom_unregister_event,
5839 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5936 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5840 }, 5937 },
5938 {
5939 .name = "pressure_level",
5940 .register_event = vmpressure_register_event,
5941 .unregister_event = vmpressure_unregister_event,
5942 },
5841#ifdef CONFIG_NUMA 5943#ifdef CONFIG_NUMA
5842 { 5944 {
5843 .name = "numa_stat", 5945 .name = "numa_stat",
@@ -6119,6 +6221,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6119 memcg->move_charge_at_immigrate = 0; 6221 memcg->move_charge_at_immigrate = 0;
6120 mutex_init(&memcg->thresholds_lock); 6222 mutex_init(&memcg->thresholds_lock);
6121 spin_lock_init(&memcg->move_lock); 6223 spin_lock_init(&memcg->move_lock);
6224 vmpressure_init(&memcg->vmpressure);
6122 6225
6123 return &memcg->css; 6226 return &memcg->css;
6124 6227
@@ -6184,10 +6287,29 @@ mem_cgroup_css_online(struct cgroup *cont)
6184 return error; 6287 return error;
6185} 6288}
6186 6289
6290/*
6291 * Announce all parents that a group from their hierarchy is gone.
6292 */
6293static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6294{
6295 struct mem_cgroup *parent = memcg;
6296
6297 while ((parent = parent_mem_cgroup(parent)))
6298 atomic_inc(&parent->dead_count);
6299
6300 /*
6301 * if the root memcg is not hierarchical we have to check it
6302 * explicitely.
6303 */
6304 if (!root_mem_cgroup->use_hierarchy)
6305 atomic_inc(&root_mem_cgroup->dead_count);
6306}
6307
6187static void mem_cgroup_css_offline(struct cgroup *cont) 6308static void mem_cgroup_css_offline(struct cgroup *cont)
6188{ 6309{
6189 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6310 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6190 6311
6312 mem_cgroup_invalidate_reclaim_iterators(memcg);
6191 mem_cgroup_reparent_charges(memcg); 6313 mem_cgroup_reparent_charges(memcg);
6192 mem_cgroup_destroy_all_caches(memcg); 6314 mem_cgroup_destroy_all_caches(memcg);
6193} 6315}
@@ -6787,6 +6909,21 @@ static void mem_cgroup_move_task(struct cgroup *cont,
6787} 6909}
6788#endif 6910#endif
6789 6911
6912/*
6913 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6914 * to verify sane_behavior flag on each mount attempt.
6915 */
6916static void mem_cgroup_bind(struct cgroup *root)
6917{
6918 /*
6919 * use_hierarchy is forced with sane_behavior. cgroup core
6920 * guarantees that @root doesn't have any children, so turning it
6921 * on for the root memcg is enough.
6922 */
6923 if (cgroup_sane_behavior(root))
6924 mem_cgroup_from_cont(root)->use_hierarchy = true;
6925}
6926
6790struct cgroup_subsys mem_cgroup_subsys = { 6927struct cgroup_subsys mem_cgroup_subsys = {
6791 .name = "memory", 6928 .name = "memory",
6792 .subsys_id = mem_cgroup_subsys_id, 6929 .subsys_id = mem_cgroup_subsys_id,
@@ -6797,6 +6934,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
6797 .can_attach = mem_cgroup_can_attach, 6934 .can_attach = mem_cgroup_can_attach,
6798 .cancel_attach = mem_cgroup_cancel_attach, 6935 .cancel_attach = mem_cgroup_cancel_attach,
6799 .attach = mem_cgroup_move_task, 6936 .attach = mem_cgroup_move_task,
6937 .bind = mem_cgroup_bind,
6800 .base_cftypes = mem_cgroup_files, 6938 .base_cftypes = mem_cgroup_files,
6801 .early_init = 0, 6939 .early_init = 0,
6802 .use_id = 1, 6940 .use_id = 1,
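
The new memory.pressure_level file hooks into the existing cgroup event machinery, so userspace subscribes to vmpressure notifications with an eventfd armed through cgroup.event_control. A minimal userspace sketch, assuming a v1 memory controller mounted at /sys/fs/cgroup/memory and a hypothetical group named mygroup; the level written may be "low", "medium" or "critical":

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *dir = "/sys/fs/cgroup/memory/mygroup";  /* assumed mount point */
        char path[256], line[64];
        uint64_t count;
        int efd, pfd, cfd;

        efd = eventfd(0, 0);
        snprintf(path, sizeof(path), "%s/memory.pressure_level", dir);
        pfd = open(path, O_RDONLY);
        snprintf(path, sizeof(path), "%s/cgroup.event_control", dir);
        cfd = open(path, O_WRONLY);
        if (efd < 0 || pfd < 0 || cfd < 0)
                return 1;

        /* "<eventfd> <pressure_level fd> <level>" arms the notification */
        snprintf(line, sizeof(line), "%d %d medium", efd, pfd);
        if (write(cfd, line, strlen(line)) < 0)
                return 1;

        read(efd, &count, sizeof(count));  /* blocks until pressure is reported */
        printf("memory pressure event, count=%llu\n", (unsigned long long)count);
        return 0;
}
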
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index df0694c6adef..ceb0c7f1932f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -785,10 +785,10 @@ static struct page_state {
785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
788 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, 788 { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
789 789
790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, 790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
791 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, 791 { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
diff --git a/mm/memory.c b/mm/memory.c
index 494526ae024a..6dc1882fbd72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -216,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
216 tlb->mm = mm; 216 tlb->mm = mm;
217 217
218 tlb->fullmm = fullmm; 218 tlb->fullmm = fullmm;
219 tlb->need_flush_all = 0;
219 tlb->start = -1UL; 220 tlb->start = -1UL;
220 tlb->end = 0; 221 tlb->end = 0;
221 tlb->need_flush = 0; 222 tlb->need_flush = 0;
@@ -714,11 +715,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
714 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 715 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
715 */ 716 */
716 if (vma->vm_ops) 717 if (vma->vm_ops)
717 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", 718 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
718 (unsigned long)vma->vm_ops->fault); 719 vma->vm_ops->fault);
719 if (vma->vm_file && vma->vm_file->f_op) 720 if (vma->vm_file && vma->vm_file->f_op)
720 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 721 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
721 (unsigned long)vma->vm_file->f_op->mmap); 722 vma->vm_file->f_op->mmap);
722 dump_stack(); 723 dump_stack();
723 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 724 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
724} 725}
@@ -2392,6 +2393,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2392} 2393}
2393EXPORT_SYMBOL(remap_pfn_range); 2394EXPORT_SYMBOL(remap_pfn_range);
2394 2395
2396/**
2397 * vm_iomap_memory - remap memory to userspace
2398 * @vma: user vma to map to
2399 * @start: start of area
2400 * @len: size of area
2401 *
2402 * This is a simplified io_remap_pfn_range() for common driver use. The
2403 * driver just needs to give us the physical memory range to be mapped,
2404 * we'll figure out the rest from the vma information.
2405 *
2406 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2407 * whatever write-combining details or similar.
2408 */
2409int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2410{
2411 unsigned long vm_len, pfn, pages;
2412
2413 /* Check that the physical memory area passed in looks valid */
2414 if (start + len < start)
2415 return -EINVAL;
2416 /*
2417 * You *really* shouldn't map things that aren't page-aligned,
2418 * but we've historically allowed it because IO memory might
2419 * just have smaller alignment.
2420 */
2421 len += start & ~PAGE_MASK;
2422 pfn = start >> PAGE_SHIFT;
2423 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2424 if (pfn + pages < pfn)
2425 return -EINVAL;
2426
2427 /* We start the mapping 'vm_pgoff' pages into the area */
2428 if (vma->vm_pgoff > pages)
2429 return -EINVAL;
2430 pfn += vma->vm_pgoff;
2431 pages -= vma->vm_pgoff;
2432
2433 /* Can we fit all of the mapping? */
2434 vm_len = vma->vm_end - vma->vm_start;
2435 if (vm_len >> PAGE_SHIFT > pages)
2436 return -EINVAL;
2437
2438 /* Ok, let it rip */
2439 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2440}
2441EXPORT_SYMBOL(vm_iomap_memory);
2442
2395static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 2443static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2396 unsigned long addr, unsigned long end, 2444 unsigned long addr, unsigned long end,
2397 pte_fn_t fn, void *data) 2445 pte_fn_t fn, void *data)
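
vm_iomap_memory() exists to replace the bounds, alignment and vm_pgoff checks that drivers used to open-code around io_remap_pfn_range(). A hedged sketch of a driver mmap handler built on it; struct mydev and its fields are illustration-only assumptions, not part of this patch:

struct mydev {                          /* hypothetical device state */
        phys_addr_t phys_base;          /* start of the register window */
        unsigned long region_len;       /* its length in bytes */
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct mydev *dev = file->private_data;

        /* tweak caching before mapping, as the NOTE above suggests */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

        /* alignment, overflow and vm_pgoff handling are done by the helper */
        return vm_iomap_memory(vma, dev->phys_base, dev->region_len);
}
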
@@ -3196,6 +3244,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3196 page = alloc_zeroed_user_highpage_movable(vma, address); 3244 page = alloc_zeroed_user_highpage_movable(vma, address);
3197 if (!page) 3245 if (!page)
3198 goto oom; 3246 goto oom;
3247 /*
3248 * The memory barrier inside __SetPageUptodate makes sure that
3249 * preceding stores to the page contents become visible before
3250 * the set_pte_at() write.
3251 */
3199 __SetPageUptodate(page); 3252 __SetPageUptodate(page);
3200 3253
3201 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) 3254 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b81a367b9f39..a221fac1f47d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -436,6 +436,40 @@ static int __meminit __add_section(int nid, struct zone *zone,
436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
437} 437}
438 438
439/*
440 * Reasonably generic function for adding memory. It is
441 * expected that archs that support memory hotplug will
442 * call this function after deciding the zone to which to
443 * add the new pages.
444 */
445int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
446 unsigned long nr_pages)
447{
448 unsigned long i;
449 int err = 0;
450 int start_sec, end_sec;
451 /* during initialize mem_map, align hot-added range to section */
452 start_sec = pfn_to_section_nr(phys_start_pfn);
453 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
454
455 for (i = start_sec; i <= end_sec; i++) {
456 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
457
458 /*
459 * EEXIST is finally dealt with by ioresource collision
460 * check. see add_memory() => register_memory_resource()
461 * Warning will be printed if there is collision.
462 */
463 if (err && (err != -EEXIST))
464 break;
465 err = 0;
466 }
467
468 return err;
469}
470EXPORT_SYMBOL_GPL(__add_pages);
471
472#ifdef CONFIG_MEMORY_HOTREMOVE
439/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 473/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
440static int find_smallest_section_pfn(int nid, struct zone *zone, 474static int find_smallest_section_pfn(int nid, struct zone *zone,
441 unsigned long start_pfn, 475 unsigned long start_pfn,
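
Hoisting __add_pages() above the new CONFIG_MEMORY_HOTREMOVE block keeps hot-add available even when hot-remove is compiled out; its caller remains the architecture's arch_add_memory(). A rough sketch of that calling shape for this kernel generation, with the zone choice and the page-table setup simplified to assumptions that vary per architecture:

int __ref arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;   /* simplified */
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;

        /* arch-specific: create kernel mappings for the new range first */

        return __add_pages(nid, zone, start_pfn, nr_pages);
}
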
@@ -658,39 +692,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
658 return 0; 692 return 0;
659} 693}
660 694
661/*
662 * Reasonably generic function for adding memory. It is
663 * expected that archs that support memory hotplug will
664 * call this function after deciding the zone to which to
665 * add the new pages.
666 */
667int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
668 unsigned long nr_pages)
669{
670 unsigned long i;
671 int err = 0;
672 int start_sec, end_sec;
673 /* during initialize mem_map, align hot-added range to section */
674 start_sec = pfn_to_section_nr(phys_start_pfn);
675 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
676
677 for (i = start_sec; i <= end_sec; i++) {
678 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
679
680 /*
681 * EEXIST is finally dealt with by ioresource collision
682 * check. see add_memory() => register_memory_resource()
683 * Warning will be printed if there is collision.
684 */
685 if (err && (err != -EEXIST))
686 break;
687 err = 0;
688 }
689
690 return err;
691}
692EXPORT_SYMBOL_GPL(__add_pages);
693
694/** 695/**
695 * __remove_pages() - remove sections of pages from a zone 696 * __remove_pages() - remove sections of pages from a zone
696 * @zone: zone from which pages need to be removed 697 * @zone: zone from which pages need to be removed
@@ -705,8 +706,10 @@ EXPORT_SYMBOL_GPL(__add_pages);
705int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 706int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
706 unsigned long nr_pages) 707 unsigned long nr_pages)
707{ 708{
708 unsigned long i, ret = 0; 709 unsigned long i;
709 int sections_to_remove; 710 int sections_to_remove;
711 resource_size_t start, size;
712 int ret = 0;
710 713
711 /* 714 /*
712 * We can only remove entire sections 715 * We can only remove entire sections
@@ -714,7 +717,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
714 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 717 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
715 BUG_ON(nr_pages % PAGES_PER_SECTION); 718 BUG_ON(nr_pages % PAGES_PER_SECTION);
716 719
717 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); 720 start = phys_start_pfn << PAGE_SHIFT;
721 size = nr_pages * PAGE_SIZE;
722 ret = release_mem_region_adjustable(&iomem_resource, start, size);
723 if (ret)
724 pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",
725 start, start + size - 1, ret);
718 726
719 sections_to_remove = nr_pages / PAGES_PER_SECTION; 727 sections_to_remove = nr_pages / PAGES_PER_SECTION;
720 for (i = 0; i < sections_to_remove; i++) { 728 for (i = 0; i < sections_to_remove; i++) {
@@ -726,6 +734,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
726 return ret; 734 return ret;
727} 735}
728EXPORT_SYMBOL_GPL(__remove_pages); 736EXPORT_SYMBOL_GPL(__remove_pages);
737#endif /* CONFIG_MEMORY_HOTREMOVE */
729 738
730int set_online_page_callback(online_page_callback_t callback) 739int set_online_page_callback(online_page_callback_t callback)
731{ 740{
@@ -1613,7 +1622,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1613/** 1622/**
1614 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1623 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1615 * @start_pfn: start pfn of the memory range 1624 * @start_pfn: start pfn of the memory range
1616 * @end_pfn: end pft of the memory range 1625 * @end_pfn: end pfn of the memory range
1617 * @arg: argument passed to func 1626 * @arg: argument passed to func
1618 * @func: callback for each memory section walked 1627 * @func: callback for each memory section walked
1619 * 1628 *
@@ -1681,11 +1690,15 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1681{ 1690{
1682 int ret = !is_memblock_offlined(mem); 1691 int ret = !is_memblock_offlined(mem);
1683 1692
1684 if (unlikely(ret)) 1693 if (unlikely(ret)) {
1694 phys_addr_t beginpa, endpa;
1695
1696 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1697 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1685 pr_warn("removing memory fails, because memory " 1698 pr_warn("removing memory fails, because memory "
1686 "[%#010llx-%#010llx] is onlined\n", 1699 "[%pa-%pa] is onlined\n",
1687 PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), 1700 &beginpa, &endpa);
1688 PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); 1701 }
1689 1702
1690 return ret; 1703 return ret;
1691} 1704}
@@ -1779,7 +1792,11 @@ void try_offline_node(int nid)
1779 for (i = 0; i < MAX_NR_ZONES; i++) { 1792 for (i = 0; i < MAX_NR_ZONES; i++) {
1780 struct zone *zone = pgdat->node_zones + i; 1793 struct zone *zone = pgdat->node_zones + i;
1781 1794
1782 if (zone->wait_table) 1795 /*
1796 * wait_table may be allocated from boot memory,
1797 * here only free if it's allocated by vmalloc.
1798 */
1799 if (is_vmalloc_addr(zone->wait_table))
1783 vfree(zone->wait_table); 1800 vfree(zone->wait_table);
1784 } 1801 }
1785 1802
@@ -1801,7 +1818,7 @@ int __ref remove_memory(int nid, u64 start, u64 size)
1801 int retry = 1; 1818 int retry = 1;
1802 1819
1803 start_pfn = PFN_DOWN(start); 1820 start_pfn = PFN_DOWN(start);
1804 end_pfn = start_pfn + PFN_DOWN(size); 1821 end_pfn = PFN_UP(start + size - 1);
1805 1822
1806 /* 1823 /*
1807 * When CONFIG_MEMCG is on, one memory block may be used by other 1824 * When CONFIG_MEMCG is on, one memory block may be used by other
diff --git a/mm/migrate.c b/mm/migrate.c
index 3bbaf5d230b0..27ed22579fd9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -736,7 +736,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
736 736
737 if (PageWriteback(page)) { 737 if (PageWriteback(page)) {
738 /* 738 /*
739 * Only in the case of a full syncronous migration is it 739 * Only in the case of a full synchronous migration is it
740 * necessary to wait for PageWriteback. In the async case, 740 * necessary to wait for PageWriteback. In the async case,
741 * the retry loop is too short and in the sync-light case, 741 * the retry loop is too short and in the sync-light case,
742 * the overhead of stalling is too much 742 * the overhead of stalling is too much
@@ -973,19 +973,23 @@ out:
973} 973}
974 974
975/* 975/*
976 * migrate_pages 976 * migrate_pages - migrate the pages specified in a list, to the free pages
977 * supplied as the target for the page migration
977 * 978 *
978 * The function takes one list of pages to migrate and a function 979 * @from: The list of pages to be migrated.
979 * that determines from the page to be migrated and the private data 980 * @get_new_page: The function used to allocate free pages to be used
980 * the target of the move and allocates the page. 981 * as the target of the page migration.
982 * @private: Private data to be passed on to get_new_page()
983 * @mode: The migration mode that specifies the constraints for
984 * page migration, if any.
985 * @reason: The reason for page migration.
981 * 986 *
982 * The function returns after 10 attempts or if no pages 987 * The function returns after 10 attempts or if no pages are movable any more
983 * are movable anymore because to has become empty 988 * because the list has become empty or no retryable pages exist any more.
984 * or no retryable pages exist anymore. 989 * The caller should call putback_lru_pages() to return pages to the LRU
985 * Caller should call putback_lru_pages to return pages to the LRU
986 * or free list only if ret != 0. 990 * or free list only if ret != 0.
987 * 991 *
988 * Return: Number of pages not migrated or error code. 992 * Returns the number of pages that were not migrated, or an error code.
989 */ 993 */
990int migrate_pages(struct list_head *from, new_page_t get_new_page, 994int migrate_pages(struct list_head *from, new_page_t get_new_page,
991 unsigned long private, enum migrate_mode mode, int reason) 995 unsigned long private, enum migrate_mode mode, int reason)
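
The rewritten kernel-doc makes the callback-driven contract explicit: the caller supplies the page list plus an allocator for target pages and must put back anything that did not move. A minimal sketch of such a caller, assuming the new_page_t signature of this era; alloc_target_page() and move_list_to_node() are hypothetical names:

/* Allocate a target page on the requested node (illustrative only). */
static struct page *alloc_target_page(struct page *page, unsigned long node,
                                      int **result)
{
        return alloc_pages_exact_node((int)node,
                                      GFP_HIGHUSER_MOVABLE | __GFP_NORETRY, 0);
}

static int move_list_to_node(struct list_head *pagelist, int target_node)
{
        int ret;

        ret = migrate_pages(pagelist, alloc_target_page,
                            (unsigned long)target_node,
                            MIGRATE_SYNC, MR_SYSCALL);
        if (ret)
                /* not everything moved: hand the leftovers back to the LRU */
                putback_lru_pages(pagelist);
        return ret;
}
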
diff --git a/mm/mlock.c b/mm/mlock.c
index 1c5e33fce639..79b7cf7d1bca 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
358 358
359 newflags = vma->vm_flags & ~VM_LOCKED; 359 newflags = vma->vm_flags & ~VM_LOCKED;
360 if (on) 360 if (on)
361 newflags |= VM_LOCKED | VM_POPULATE; 361 newflags |= VM_LOCKED;
362 362
363 tmp = vma->vm_end; 363 tmp = vma->vm_end;
364 if (tmp > end) 364 if (tmp > end)
@@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
418 * range with the first VMA. Also, skip undesirable VMA types. 418 * range with the first VMA. Also, skip undesirable VMA types.
419 */ 419 */
420 nend = min(end, vma->vm_end); 420 nend = min(end, vma->vm_end);
421 if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != 421 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
422 VM_POPULATE)
423 continue; 422 continue;
424 if (nstart < vma->vm_start) 423 if (nstart < vma->vm_start)
425 nstart = vma->vm_start; 424 nstart = vma->vm_start;
@@ -492,9 +491,9 @@ static int do_mlockall(int flags)
492 struct vm_area_struct * vma, * prev = NULL; 491 struct vm_area_struct * vma, * prev = NULL;
493 492
494 if (flags & MCL_FUTURE) 493 if (flags & MCL_FUTURE)
495 current->mm->def_flags |= VM_LOCKED | VM_POPULATE; 494 current->mm->def_flags |= VM_LOCKED;
496 else 495 else
497 current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); 496 current->mm->def_flags &= ~VM_LOCKED;
498 if (flags == MCL_FUTURE) 497 if (flags == MCL_FUTURE)
499 goto out; 498 goto out;
500 499
@@ -503,7 +502,7 @@ static int do_mlockall(int flags)
503 502
504 newflags = vma->vm_flags & ~VM_LOCKED; 503 newflags = vma->vm_flags & ~VM_LOCKED;
505 if (flags & MCL_CURRENT) 504 if (flags & MCL_CURRENT)
506 newflags |= VM_LOCKED | VM_POPULATE; 505 newflags |= VM_LOCKED;
507 506
508 /* Ignore errors */ 507 /* Ignore errors */
509 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 508 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 2664a47cec93..da3e9c04bf37 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,6 +6,7 @@
6 * Address space accounting code <alan@lxorguk.ukuu.org.uk> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9#include <linux/kernel.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
10#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -33,6 +34,8 @@
33#include <linux/uprobes.h> 34#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h> 35#include <linux/rbtree_augmented.h>
35#include <linux/sched/sysctl.h> 36#include <linux/sched/sysctl.h>
37#include <linux/notifier.h>
38#include <linux/memory.h>
36 39
37#include <asm/uaccess.h> 40#include <asm/uaccess.h>
38#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
@@ -84,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot);
84int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
85int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
86int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
87/* 92/*
88 * Make sure vm_committed_as in one cacheline and not cacheline shared with 93 * Make sure vm_committed_as in one cacheline and not cacheline shared with
89 * other variables. It can be updated by several CPUs frequently. 94 * other variables. It can be updated by several CPUs frequently.
@@ -122,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
122 */ 127 */
123int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 128int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
124{ 129{
125 unsigned long free, allowed; 130 unsigned long free, allowed, reserve;
126 131
127 vm_acct_memory(pages); 132 vm_acct_memory(pages);
128 133
@@ -163,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
163 free -= totalreserve_pages; 168 free -= totalreserve_pages;
164 169
165 /* 170 /*
166 * Leave the last 3% for root 171 * Reserve some for root
167 */ 172 */
168 if (!cap_sys_admin) 173 if (!cap_sys_admin)
169 free -= free / 32; 174 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
170 175
171 if (free > pages) 176 if (free > pages)
172 return 0; 177 return 0;
@@ -177,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
177 allowed = (totalram_pages - hugetlb_total_pages()) 182 allowed = (totalram_pages - hugetlb_total_pages())
178 * sysctl_overcommit_ratio / 100; 183 * sysctl_overcommit_ratio / 100;
179 /* 184 /*
180 * Leave the last 3% for root 185 * Reserve some for root
181 */ 186 */
182 if (!cap_sys_admin) 187 if (!cap_sys_admin)
183 allowed -= allowed / 32; 188 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
184 allowed += total_swap_pages; 189 allowed += total_swap_pages;
185 190
186 /* Don't let a single process grow too big: 191 /*
187 leave 3% of the size of this process for other processes */ 192 * Don't let a single process grow so big a user can't recover
188 if (mm) 193 */
189 allowed -= mm->total_vm / 32; 194 if (mm) {
195 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
196 allowed -= min(mm->total_vm / 32, reserve);
197 }
190 198
191 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 199 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
192 return 0; 200 return 0;
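
The shift converts the sysctls from kilobytes to pages: with 4 KB pages, PAGE_SHIFT - 10 is 2, so the default admin reserve of 1UL << 13 = 8192 kB becomes 2048 pages (8 MB) and the default user reserve of 1UL << 17 = 131072 kB becomes 32768 pages (128 MB). A process is then charged the smaller of mm->total_vm / 32 (roughly 3% of its own size) and that 128 MB cap, so a single huge process no longer inflates the reserve without bound.
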
@@ -543,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
543 return 0; 551 return 0;
544} 552}
545 553
554static unsigned long count_vma_pages_range(struct mm_struct *mm,
555 unsigned long addr, unsigned long end)
556{
557 unsigned long nr_pages = 0;
558 struct vm_area_struct *vma;
559
560 /* Find first overlapping mapping */
561 vma = find_vma_intersection(mm, addr, end);
562 if (!vma)
563 return 0;
564
565 nr_pages = (min(end, vma->vm_end) -
566 max(addr, vma->vm_start)) >> PAGE_SHIFT;
567
568 /* Iterate over the rest of the overlaps */
569 for (vma = vma->vm_next; vma; vma = vma->vm_next) {
570 unsigned long overlap_len;
571
572 if (vma->vm_start > end)
573 break;
574
575 overlap_len = min(end, vma->vm_end) - vma->vm_start;
576 nr_pages += overlap_len >> PAGE_SHIFT;
577 }
578
579 return nr_pages;
580}
581
546void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 582void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
547 struct rb_node **rb_link, struct rb_node *rb_parent) 583 struct rb_node **rb_link, struct rb_node *rb_parent)
548{ 584{
@@ -829,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end);
829 if (next->anon_vma) 865 if (next->anon_vma)
830 anon_vma_merge(vma, next); 866 anon_vma_merge(vma, next);
831 mm->map_count--; 867 mm->map_count--;
832 mpol_put(vma_policy(next)); 868 vma_set_policy(vma, vma_policy(next));
833 kmem_cache_free(vm_area_cachep, next); 869 kmem_cache_free(vm_area_cachep, next);
834 /* 870 /*
835 * In mprotect's case 6 (see comments on vma_merge), 871 * In mprotect's case 6 (see comments on vma_merge),
@@ -1306,7 +1342,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1306 } 1342 }
1307 1343
1308 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1344 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1309 if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) 1345 if (!IS_ERR_VALUE(addr) &&
1346 ((vm_flags & VM_LOCKED) ||
1347 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1310 *populate = len; 1348 *populate = len;
1311 return addr; 1349 return addr;
1312} 1350}
@@ -1433,6 +1471,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1433 unsigned long charged = 0; 1471 unsigned long charged = 0;
1434 struct inode *inode = file ? file_inode(file) : NULL; 1472 struct inode *inode = file ? file_inode(file) : NULL;
1435 1473
1474 /* Check against address space limit. */
1475 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
1476 unsigned long nr_pages;
1477
1478 /*
1479 * MAP_FIXED may remove pages of mappings that intersects with
1480 * requested mapping. Account for the pages it would unmap.
1481 */
1482 if (!(vm_flags & MAP_FIXED))
1483 return -ENOMEM;
1484
1485 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1486
1487 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
1488 return -ENOMEM;
1489 }
1490
1436 /* Clear old maps */ 1491 /* Clear old maps */
1437 error = -ENOMEM; 1492 error = -ENOMEM;
1438munmap_back: 1493munmap_back:
@@ -1442,10 +1497,6 @@ munmap_back:
1442 goto munmap_back; 1497 goto munmap_back;
1443 } 1498 }
1444 1499
1445 /* Check against address space limit. */
1446 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1447 return -ENOMEM;
1448
1449 /* 1500 /*
1450 * Private writable mapping: check memory availability 1501 * Private writable mapping: check memory availability
1451 */ 1502 */
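
Hoisting the limit check ahead of the "Clear old maps" step means a request that would exceed the address-space limit now fails before any overlapped mappings are torn down, rather than unmapping them first and only then returning -ENOMEM. Subtracting count_vma_pages_range() keeps the accounting net of what the fixed mapping replaces: for example, a 1 MB MAP_FIXED mapping placed exactly over an existing 1 MB mapping adds no pages and still passes at the limit.
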
@@ -1933,12 +1984,9 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1933{ 1984{
1934 struct vm_area_struct *vma = NULL; 1985 struct vm_area_struct *vma = NULL;
1935 1986
1936 if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
1937 return NULL;
1938
1939 /* Check the cache first. */ 1987 /* Check the cache first. */
1940 /* (Cache hit rate is typically around 35%.) */ 1988 /* (Cache hit rate is typically around 35%.) */
1941 vma = mm->mmap_cache; 1989 vma = ACCESS_ONCE(mm->mmap_cache);
1942 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1990 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1943 struct rb_node *rb_node; 1991 struct rb_node *rb_node;
1944 1992
@@ -2303,7 +2351,7 @@ static void unmap_region(struct mm_struct *mm,
2303 update_hiwater_rss(mm); 2351 update_hiwater_rss(mm);
2304 unmap_vmas(&tlb, vma, start, end); 2352 unmap_vmas(&tlb, vma, start, end);
2305 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 2353 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2306 next ? next->vm_start : 0); 2354 next ? next->vm_start : USER_PGTABLES_CEILING);
2307 tlb_finish_mmu(&tlb, start, end); 2355 tlb_finish_mmu(&tlb, start, end);
2308} 2356}
2309 2357
@@ -2683,7 +2731,7 @@ void exit_mmap(struct mm_struct *mm)
2683 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2731 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2684 unmap_vmas(&tlb, vma, 0, -1); 2732 unmap_vmas(&tlb, vma, 0, -1);
2685 2733
2686 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2734 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2687 tlb_finish_mmu(&tlb, 0, -1); 2735 tlb_finish_mmu(&tlb, 0, -1);
2688 2736
2689 /* 2737 /*
@@ -3095,3 +3143,115 @@ void __init mmap_init(void)
3095 ret = percpu_counter_init(&vm_committed_as, 0); 3143 ret = percpu_counter_init(&vm_committed_as, 0);
3096 VM_BUG_ON(ret); 3144 VM_BUG_ON(ret);
3097} 3145}
3146
3147/*
3148 * Initialise sysctl_user_reserve_kbytes.
3149 *
3150 * This is intended to prevent a user from starting a single memory hogging
3151 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
3152 * mode.
3153 *
3154 * The default value is min(3% of free memory, 128MB)
3155 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3156 */
3157static int init_user_reserve(void)
3158{
3159 unsigned long free_kbytes;
3160
3161 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3162
3163 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3164 return 0;
3165}
3166module_init(init_user_reserve)
3167
3168/*
3169 * Initialise sysctl_admin_reserve_kbytes.
3170 *
3171 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3172 * to log in and kill a memory hogging process.
3173 *
3174 * Systems with more than 256MB will reserve 8MB, enough to recover
3175 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3176 * only reserve 3% of free pages by default.
3177 */
3178static int init_admin_reserve(void)
3179{
3180 unsigned long free_kbytes;
3181
3182 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3183
3184 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3185 return 0;
3186}
3187module_init(init_admin_reserve)
3188
3189/*
3190 * Reinitialise user and admin reserves if memory is added or removed.
3191 *
3192 * The default user reserve max is 128MB, and the default max for the
3193 * admin reserve is 8MB. These are usually, but not always, enough to
3194 * enable recovery from a memory hogging process using login/sshd, a shell,
3195 * and tools like top. It may make sense to increase or even disable the
3196 * reserve depending on the existence of swap or variations in the recovery
3197 * tools. So, the admin may have changed them.
3198 *
3199 * If memory is added and the reserves have been eliminated or increased above
3200 * the default max, then we'll trust the admin.
3201 *
3202 * If memory is removed and there isn't enough free memory, then we
3203 * need to reset the reserves.
3204 *
3205 * Otherwise keep the reserve set by the admin.
3206 */
3207static int reserve_mem_notifier(struct notifier_block *nb,
3208 unsigned long action, void *data)
3209{
3210 unsigned long tmp, free_kbytes;
3211
3212 switch (action) {
3213 case MEM_ONLINE:
3214 /* Default max is 128MB. Leave alone if modified by operator. */
3215 tmp = sysctl_user_reserve_kbytes;
3216 if (0 < tmp && tmp < (1UL << 17))
3217 init_user_reserve();
3218
3219 /* Default max is 8MB. Leave alone if modified by operator. */
3220 tmp = sysctl_admin_reserve_kbytes;
3221 if (0 < tmp && tmp < (1UL << 13))
3222 init_admin_reserve();
3223
3224 break;
3225 case MEM_OFFLINE:
3226 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3227
3228 if (sysctl_user_reserve_kbytes > free_kbytes) {
3229 init_user_reserve();
3230 pr_info("vm.user_reserve_kbytes reset to %lu\n",
3231 sysctl_user_reserve_kbytes);
3232 }
3233
3234 if (sysctl_admin_reserve_kbytes > free_kbytes) {
3235 init_admin_reserve();
3236 pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3237 sysctl_admin_reserve_kbytes);
3238 }
3239 break;
3240 default:
3241 break;
3242 }
3243 return NOTIFY_OK;
3244}
3245
3246static struct notifier_block reserve_mem_nb = {
3247 .notifier_call = reserve_mem_notifier,
3248};
3249
3250static int __meminit init_reserve_notifier(void)
3251{
3252 if (register_hotmemory_notifier(&reserve_mem_nb))
3253 printk("Failed registering memory add/remove notifier for admin reserve");
3254
3255 return 0;
3256}
3257module_init(init_reserve_notifier)
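
Working through the defaults: init_user_reserve() and init_admin_reserve() take min(free/32, cap) at boot, so a machine with roughly 2 GB free gets a 64 MB user reserve and the full 8 MB admin reserve, while anything above about 4 GB free hits the 128 MB user cap (and above 256 MB free, the 8 MB admin cap). The notifier then recomputes the values on hot-add only while they are still nonzero and below their default caps, that is, not disabled and not raised by the operator, and on hot-remove only when the current reserve no longer fits in free memory, matching the policy spelled out in the comment block above.
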
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36e381e..bdd3fa2fc73b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -45,9 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
45 if (!addr) 45 if (!addr)
46 return NULL; 46 return NULL;
47 47
48 memblock_reserve(addr, size);
48 ptr = phys_to_virt(addr); 49 ptr = phys_to_virt(addr);
49 memset(ptr, 0, size); 50 memset(ptr, 0, size);
50 memblock_reserve(addr, size);
51 /* 51 /*
52 * The min_count is set to 0 so that bootmem allocated blocks 52 * The min_count is set to 0 so that bootmem allocated blocks
53 * are never reported as leaks. 53 * are never reported as leaks.
@@ -120,7 +120,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
120 return end_pfn - start_pfn; 120 return end_pfn - start_pfn;
121} 121}
122 122
123unsigned long __init free_low_memory_core_early(int nodeid) 123static unsigned long __init free_low_memory_core_early(void)
124{ 124{
125 unsigned long count = 0; 125 unsigned long count = 0;
126 phys_addr_t start, end, size; 126 phys_addr_t start, end, size;
@@ -170,7 +170,7 @@ unsigned long __init free_all_bootmem(void)
170 * because in some case like Node0 doesn't have RAM installed 170 * because in some case like Node0 doesn't have RAM installed
171 * low ram will be on Node1 171 * low ram will be on Node1
172 */ 172 */
173 return free_low_memory_core_early(MAX_NUMNODES); 173 return free_low_memory_core_early();
174} 174}
175 175
176/** 176/**
diff --git a/mm/nommu.c b/mm/nommu.c
index e19328087534..fbe3e2f317eb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -63,6 +63,8 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
63int sysctl_overcommit_ratio = 50; /* default is 50% */ 63int sysctl_overcommit_ratio = 50; /* default is 50% */
64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
67unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
66int heap_stack_gap = 0; 68int heap_stack_gap = 0;
67 69
68atomic_long_t mmap_pages_allocated; 70atomic_long_t mmap_pages_allocated;
@@ -228,8 +230,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
228} 230}
229EXPORT_SYMBOL(follow_pfn); 231EXPORT_SYMBOL(follow_pfn);
230 232
231DEFINE_RWLOCK(vmlist_lock); 233LIST_HEAD(vmap_area_list);
232struct vm_struct *vmlist;
233 234
234void vfree(const void *addr) 235void vfree(const void *addr)
235{ 236{
@@ -821,7 +822,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
821 struct vm_area_struct *vma; 822 struct vm_area_struct *vma;
822 823
823 /* check the cache first */ 824 /* check the cache first */
824 vma = mm->mmap_cache; 825 vma = ACCESS_ONCE(mm->mmap_cache);
825 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 826 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
826 return vma; 827 return vma;
827 828
@@ -1838,6 +1839,16 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1838} 1839}
1839EXPORT_SYMBOL(remap_pfn_range); 1840EXPORT_SYMBOL(remap_pfn_range);
1840 1841
1842int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1843{
1844 unsigned long pfn = start >> PAGE_SHIFT;
1845 unsigned long vm_len = vma->vm_end - vma->vm_start;
1846
1847 pfn += vma->vm_pgoff;
1848 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1849}
1850EXPORT_SYMBOL(vm_iomap_memory);
1851
1841int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 1852int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1842 unsigned long pgoff) 1853 unsigned long pgoff)
1843{ 1854{
@@ -1888,7 +1899,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
1888 */ 1899 */
1889int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 1900int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1890{ 1901{
1891 unsigned long free, allowed; 1902 unsigned long free, allowed, reserve;
1892 1903
1893 vm_acct_memory(pages); 1904 vm_acct_memory(pages);
1894 1905
@@ -1929,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1929 free -= totalreserve_pages; 1940 free -= totalreserve_pages;
1930 1941
1931 /* 1942 /*
1932 * Leave the last 3% for root 1943 * Reserve some for root
1933 */ 1944 */
1934 if (!cap_sys_admin) 1945 if (!cap_sys_admin)
1935 free -= free / 32; 1946 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1936 1947
1937 if (free > pages) 1948 if (free > pages)
1938 return 0; 1949 return 0;
@@ -1942,16 +1953,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1942 1953
1943 allowed = totalram_pages * sysctl_overcommit_ratio / 100; 1954 allowed = totalram_pages * sysctl_overcommit_ratio / 100;
1944 /* 1955 /*
1945 * Leave the last 3% for root 1956 * Reserve some for root
1946 */ 1957 */
1947 if (!cap_sys_admin) 1958 if (!cap_sys_admin)
1948 allowed -= allowed / 32; 1959 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1949 allowed += total_swap_pages; 1960 allowed += total_swap_pages;
1950 1961
1951 /* Don't let a single process grow too big: 1962 /*
1952 leave 3% of the size of this process for other processes */ 1963 * Don't let a single process grow so big a user can't recover
1953 if (mm) 1964 */
1954 allowed -= mm->total_vm / 32; 1965 if (mm) {
1966 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
1967 allowed -= min(mm->total_vm / 32, reserve);
1968 }
1955 1969
1956 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 1970 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
1957 return 0; 1971 return 0;
@@ -2113,3 +2127,45 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2113 up_write(&nommu_region_sem); 2127 up_write(&nommu_region_sem);
2114 return 0; 2128 return 0;
2115} 2129}
2130
2131/*
2132 * Initialise sysctl_user_reserve_kbytes.
2133 *
2134 * This is intended to prevent a user from starting a single memory hogging
2135 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
2136 * mode.
2137 *
2138 * The default value is min(3% of free memory, 128MB)
2139 * 128MB is enough to recover with sshd/login, bash, and top/kill.
2140 */
2141static int __meminit init_user_reserve(void)
2142{
2143 unsigned long free_kbytes;
2144
2145 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
2146
2147 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
2148 return 0;
2149}
2150module_init(init_user_reserve)
2151
2152/*
2153 * Initialise sysctl_admin_reserve_kbytes.
2154 *
2155 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
2156 * to log in and kill a memory hogging process.
2157 *
2158 * Systems with more than 256MB will reserve 8MB, enough to recover
2159 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
2160 * only reserve 3% of free pages by default.
2161 */
2162static int __meminit init_admin_reserve(void)
2163{
2164 unsigned long free_kbytes;
2165
2166 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
2167
2168 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
2169 return 0;
2170}
2171module_init(init_admin_reserve)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index efe68148f621..4514ad7415c3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2311,10 +2311,6 @@ void wait_for_stable_page(struct page *page)
2311 2311
2312 if (!bdi_cap_stable_pages_required(bdi)) 2312 if (!bdi_cap_stable_pages_required(bdi))
2313 return; 2313 return;
2314#ifdef CONFIG_NEED_BOUNCE_POOL
2315 if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
2316 return;
2317#endif /* CONFIG_NEED_BOUNCE_POOL */
2318 2314
2319 wait_on_page_writeback(page); 2315 wait_on_page_writeback(page);
2320} 2316}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7823fa..98cbdf6e5532 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/migrate.h> 59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 60#include <linux/page-debug-flags.h>
61#include <linux/hugetlb.h>
61#include <linux/sched/rt.h> 62#include <linux/sched/rt.h>
62 63
63#include <asm/tlbflush.h> 64#include <asm/tlbflush.h>
@@ -1397,6 +1398,7 @@ void split_page(struct page *page, unsigned int order)
1397 for (i = 1; i < (1 << order); i++) 1398 for (i = 1; i < (1 << order); i++)
1398 set_page_refcounted(page + i); 1399 set_page_refcounted(page + i);
1399} 1400}
1401EXPORT_SYMBOL_GPL(split_page);
1400 1402
1401static int __isolate_free_page(struct page *page, unsigned int order) 1403static int __isolate_free_page(struct page *page, unsigned int order)
1402{ 1404{
@@ -1940,9 +1942,24 @@ zonelist_scan:
1940 continue; 1942 continue;
1941 default: 1943 default:
1942 /* did we reclaim enough */ 1944 /* did we reclaim enough */
1943 if (!zone_watermark_ok(zone, order, mark, 1945 if (zone_watermark_ok(zone, order, mark,
1944 classzone_idx, alloc_flags)) 1946 classzone_idx, alloc_flags))
1947 goto try_this_zone;
1948
1949 /*
1950 * Failed to reclaim enough to meet watermark.
1951 * Only mark the zone full if checking the min
1952 * watermark or if we failed to reclaim just
1953 * 1<<order pages or else the page allocator
1954 * fastpath will prematurely mark zones full
1955 * when the watermark is between the low and
1956 * min watermarks.
1957 */
1958 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
1959 ret == ZONE_RECLAIM_SOME)
1945 goto this_zone_full; 1960 goto this_zone_full;
1961
1962 continue;
1946 } 1963 }
1947 } 1964 }
1948 1965
@@ -2002,6 +2019,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2002 return; 2019 return;
2003 2020
2004 /* 2021 /*
2022 * Walking all memory to count page types is very expensive and should
2023 * be inhibited in non-blockable contexts.
2024 */
2025 if (!(gfp_mask & __GFP_WAIT))
2026 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2027
2028 /*
2005 * This documents exceptions given to allocations in certain 2029 * This documents exceptions given to allocations in certain
2006 * contexts that are allowed to allocate outside current's set 2030 * contexts that are allowed to allocate outside current's set
2007 * of allowed nodes. 2031 * of allowed nodes.
@@ -3105,6 +3129,8 @@ void show_free_areas(unsigned int filter)
3105 printk("= %lukB\n", K(total)); 3129 printk("= %lukB\n", K(total));
3106 } 3130 }
3107 3131
3132 hugetlb_show_meminfo();
3133
3108 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3134 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3109 3135
3110 show_swap_cache_info(); 3136 show_swap_cache_info();
@@ -4161,10 +4187,23 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
4161{ 4187{
4162 unsigned long start_pfn, end_pfn; 4188 unsigned long start_pfn, end_pfn;
4163 int i, nid; 4189 int i, nid;
4190 /*
4191 * NOTE: The following SMP-unsafe globals are only used early in boot
4192 * when the kernel is running single-threaded.
4193 */
4194 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4195 static int __meminitdata last_nid;
4196
4197 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4198 return last_nid;
4164 4199
4165 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4200 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4166 if (start_pfn <= pfn && pfn < end_pfn) 4201 if (start_pfn <= pfn && pfn < end_pfn) {
4202 last_start_pfn = start_pfn;
4203 last_end_pfn = end_pfn;
4204 last_nid = nid;
4167 return nid; 4205 return nid;
4206 }
4168 /* This is a memory hole */ 4207 /* This is a memory hole */
4169 return -1; 4208 return -1;
4170} 4209}
@@ -4710,7 +4749,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4710/* 4749/*
4711 * Figure out the number of possible node ids. 4750 * Figure out the number of possible node ids.
4712 */ 4751 */
4713static void __init setup_nr_node_ids(void) 4752void __init setup_nr_node_ids(void)
4714{ 4753{
4715 unsigned int node; 4754 unsigned int node;
4716 unsigned int highest = 0; 4755 unsigned int highest = 0;
@@ -4719,10 +4758,6 @@ static void __init setup_nr_node_ids(void)
4719 highest = node; 4758 highest = node;
4720 nr_node_ids = highest + 1; 4759 nr_node_ids = highest + 1;
4721} 4760}
4722#else
4723static inline void setup_nr_node_ids(void)
4724{
4725}
4726#endif 4761#endif
4727 4762
4728/** 4763/**
@@ -5113,6 +5148,35 @@ early_param("movablecore", cmdline_parse_movablecore);
5113 5148
5114#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5149#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5115 5150
5151unsigned long free_reserved_area(unsigned long start, unsigned long end,
5152 int poison, char *s)
5153{
5154 unsigned long pages, pos;
5155
5156 pos = start = PAGE_ALIGN(start);
5157 end &= PAGE_MASK;
5158 for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
5159 if (poison)
5160 memset((void *)pos, poison, PAGE_SIZE);
5161 free_reserved_page(virt_to_page(pos));
5162 }
5163
5164 if (pages && s)
5165 pr_info("Freeing %s memory: %ldK (%lx - %lx)\n",
5166 s, pages << (PAGE_SHIFT - 10), start, end);
5167
5168 return pages;
5169}
5170
5171#ifdef CONFIG_HIGHMEM
5172void free_highmem_page(struct page *page)
5173{
5174 __free_reserved_page(page);
5175 totalram_pages++;
5176 totalhigh_pages++;
5177}
5178#endif
5179
5116/** 5180/**
5117 * set_dma_reserve - set the specified number of pages reserved in the first zone 5181 * set_dma_reserve - set the specified number of pages reserved in the first zone
5118 * @new_dma_reserve: The number of pages to mark reserved 5182 * @new_dma_reserve: The number of pages to mark reserved
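
free_reserved_area() and free_highmem_page() centralise the loops that architectures open-code when handing initmem or highmem back to the buddy allocator. A hedged sketch of the typical call site, assuming the usual __init_begin/__init_end linker symbols and no poisoning:

void free_initmem(void)
{
        /* return the .init sections to the page allocator; the helper
         * prints the "Freeing ... memory" line itself */
        free_reserved_area((unsigned long)&__init_begin,
                           (unsigned long)&__init_end,
                           0, "unused kernel");
}
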
diff --git a/mm/page_io.c b/mm/page_io.c
index 78eee32ee486..bb5d75274686 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -42,7 +42,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
42 return bio; 42 return bio;
43} 43}
44 44
45static void end_swap_bio_write(struct bio *bio, int err) 45void end_swap_bio_write(struct bio *bio, int err)
46{ 46{
47 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 47 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
48 struct page *page = bio->bi_io_vec[0].bv_page; 48 struct page *page = bio->bi_io_vec[0].bv_page;
@@ -185,9 +185,7 @@ bad_bmap:
185 */ 185 */
186int swap_writepage(struct page *page, struct writeback_control *wbc) 186int swap_writepage(struct page *page, struct writeback_control *wbc)
187{ 187{
188 struct bio *bio; 188 int ret = 0;
189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
191 189
192 if (try_to_free_swap(page)) { 190 if (try_to_free_swap(page)) {
193 unlock_page(page); 191 unlock_page(page);
@@ -199,6 +197,17 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
199 end_page_writeback(page); 197 end_page_writeback(page);
200 goto out; 198 goto out;
201 } 199 }
200 ret = __swap_writepage(page, wbc, end_swap_bio_write);
201out:
202 return ret;
203}
204
205int __swap_writepage(struct page *page, struct writeback_control *wbc,
206 void (*end_write_func)(struct bio *, int))
207{
208 struct bio *bio;
209 int ret = 0, rw = WRITE;
210 struct swap_info_struct *sis = page_swap_info(page);
202 211
203 if (sis->flags & SWP_FILE) { 212 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb; 213 struct kiocb kiocb;
@@ -214,6 +223,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
214 kiocb.ki_left = PAGE_SIZE; 223 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE; 224 kiocb.ki_nbytes = PAGE_SIZE;
216 225
226 set_page_writeback(page);
217 unlock_page(page); 227 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE, 228 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov, 229 &kiocb, &iov,
@@ -222,11 +232,27 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
222 if (ret == PAGE_SIZE) { 232 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT); 233 count_vm_event(PSWPOUT);
224 ret = 0; 234 ret = 0;
235 } else {
236 /*
237 * In the case of swap-over-nfs, this can be a
238 * temporary failure if the system has limited
239 * memory for allocating transmit buffers.
240 * Mark the page dirty and avoid
241 * rotate_reclaimable_page but rate-limit the
242 * messages but do not flag PageError like
243 * the normal direct-to-bio case as it could
244 * be temporary.
245 */
246 set_page_dirty(page);
247 ClearPageReclaim(page);
248 pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
249 page_file_offset(page));
225 } 250 }
251 end_page_writeback(page);
226 return ret; 252 return ret;
227 } 253 }
228 254
229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 255 bio = get_swap_bio(GFP_NOIO, page, end_write_func);
230 if (bio == NULL) { 256 if (bio == NULL) {
231 set_page_dirty(page); 257 set_page_dirty(page);
232 unlock_page(page); 258 unlock_page(page);
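
Exporting end_swap_bio_write() and splitting __swap_writepage() out of swap_writepage() gives other swap writers, for instance a compressed swap cache writing an entry back to the real swap device, a way to submit the page with a completion handler of their own. A sketch of that usage under stated assumptions; my_end_write() and its caller are hypothetical, and the page is expected to arrive locked since __swap_writepage() sets writeback and unlocks it:

static void my_end_write(struct bio *bio, int err)
{
        /* drop any writer-private state here, then let the stock
         * completion handle the writeback and error bits */
        end_swap_bio_write(bio, err);
}

static int my_writeback_page(struct page *page)         /* page comes in locked */
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
        };

        return __swap_writepage(page, &wbc, my_end_write);
}
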
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 926b46649749..fd26d0433509 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid,
429 if (flags != 0) 429 if (flags != 0)
430 return -EINVAL; 430 return -EINVAL;
431 431
432 if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
433 goto out;
434
435 if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
436 goto out;
437
438 if (vm_write) 432 if (vm_write)
439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, 433 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
440 UIO_FASTIOV, iovstack_l, 434 UIO_FASTIOV, iovstack_l,
@@ -459,8 +453,6 @@ free_iovecs:
459 kfree(iov_r); 453 kfree(iov_r);
460 if (iov_l != iovstack_l) 454 if (iov_l != iovstack_l)
461 kfree(iov_l); 455 kfree(iov_l);
462
463out:
464 return rc; 456 return rc;
465} 457}
466 458
diff --git a/mm/rmap.c b/mm/rmap.c
index 807c96bf0dc6..6280da86b5d6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1513,6 +1513,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1513 unsigned long max_nl_size = 0; 1513 unsigned long max_nl_size = 0;
1514 unsigned int mapcount; 1514 unsigned int mapcount;
1515 1515
1516 if (PageHuge(page))
1517 pgoff = page->index << compound_order(page);
1518
1516 mutex_lock(&mapping->i_mmap_mutex); 1519 mutex_lock(&mapping->i_mmap_mutex);
1517 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1520 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1518 unsigned long address = vma_address(page, vma); 1521 unsigned long address = vma_address(page, vma);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1c44af71fcf5..39b2a0b86fe8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -25,6 +25,7 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/vfs.h> 26#include <linux/vfs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/ramfs.h>
28#include <linux/pagemap.h> 29#include <linux/pagemap.h>
29#include <linux/file.h> 30#include <linux/file.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
@@ -2830,8 +2831,6 @@ out4:
2830 * effectively equivalent, but much lighter weight. 2831 * effectively equivalent, but much lighter weight.
2831 */ 2832 */
2832 2833
2833#include <linux/ramfs.h>
2834
2835static struct file_system_type shmem_fs_type = { 2834static struct file_system_type shmem_fs_type = {
2836 .name = "tmpfs", 2835 .name = "tmpfs",
2837 .mount = ramfs_mount, 2836 .mount = ramfs_mount,
@@ -2931,11 +2930,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2931 d_instantiate(path.dentry, inode); 2930 d_instantiate(path.dentry, inode);
2932 inode->i_size = size; 2931 inode->i_size = size;
2933 clear_nlink(inode); /* It is unlinked */ 2932 clear_nlink(inode); /* It is unlinked */
2934#ifndef CONFIG_MMU
2935 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 2933 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
2936 if (IS_ERR(res)) 2934 if (IS_ERR(res))
2937 goto put_dentry; 2935 goto put_dentry;
2938#endif
2939 2936
2940 res = alloc_file(&path, FMODE_WRITE | FMODE_READ, 2937 res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2941 &shmem_file_operations); 2938 &shmem_file_operations);
diff --git a/mm/slab.c b/mm/slab.c
index 856e4a192d25..96079244c860 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2040,11 +2040,9 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
2040 } 2040 }
2041 2041
2042 if (cachep->flags & SLAB_STORE_USER) { 2042 if (cachep->flags & SLAB_STORE_USER) {
2043 printk(KERN_ERR "Last user: [<%p>]", 2043 printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
2044 *dbg_userword(cachep, objp)); 2044 *dbg_userword(cachep, objp),
2045 print_symbol("(%s)", 2045 *dbg_userword(cachep, objp));
2046 (unsigned long)*dbg_userword(cachep, objp));
2047 printk("\n");
2048 } 2046 }
2049 realobj = (char *)objp + obj_offset(cachep); 2047 realobj = (char *)objp + obj_offset(cachep);
2050 size = cachep->object_size; 2048 size = cachep->object_size;
diff --git a/mm/slub.c b/mm/slub.c
index 4aec53705e4f..a0206df88aba 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h" 19#include "slab.h"
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/notifier.h>
21#include <linux/seq_file.h> 22#include <linux/seq_file.h>
22#include <linux/kmemcheck.h> 23#include <linux/kmemcheck.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
@@ -3483,7 +3484,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
3483} 3484}
3484EXPORT_SYMBOL(kmem_cache_shrink); 3485EXPORT_SYMBOL(kmem_cache_shrink);
3485 3486
3486#if defined(CONFIG_MEMORY_HOTPLUG)
3487static int slab_mem_going_offline_callback(void *arg) 3487static int slab_mem_going_offline_callback(void *arg)
3488{ 3488{
3489 struct kmem_cache *s; 3489 struct kmem_cache *s;
@@ -3598,7 +3598,10 @@ static int slab_memory_callback(struct notifier_block *self,
3598 return ret; 3598 return ret;
3599} 3599}
3600 3600
3601#endif /* CONFIG_MEMORY_HOTPLUG */ 3601static struct notifier_block slab_memory_callback_nb = {
3602 .notifier_call = slab_memory_callback,
3603 .priority = SLAB_CALLBACK_PRI,
3604};
3602 3605
3603/******************************************************************** 3606/********************************************************************
3604 * Basic setup of slabs 3607 * Basic setup of slabs
@@ -3651,7 +3654,7 @@ void __init kmem_cache_init(void)
3651 create_boot_cache(kmem_cache_node, "kmem_cache_node", 3654 create_boot_cache(kmem_cache_node, "kmem_cache_node",
3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); 3655 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3653 3656
3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3657 register_hotmemory_notifier(&slab_memory_callback_nb);
3655 3658
3656 /* Able to allocate the per node structures */ 3659 /* Able to allocate the per node structures */
3657 slab_state = PARTIAL; 3660 slab_state = PARTIAL;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22ab9b09..27eeab3be757 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -53,10 +53,12 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
53 struct page *page; 53 struct page *page;
54 54
55 if (node_state(node, N_HIGH_MEMORY)) 55 if (node_state(node, N_HIGH_MEMORY))
56 page = alloc_pages_node(node, 56 page = alloc_pages_node(
57 GFP_KERNEL | __GFP_ZERO, get_order(size)); 57 node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
58 get_order(size));
58 else 59 else
59 page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 60 page = alloc_pages(
61 GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
60 get_order(size)); 62 get_order(size));
61 if (page) 63 if (page)
62 return page_address(page); 64 return page_address(page);
@@ -145,11 +147,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
145 return pgd; 147 return pgd;
146} 148}
147 149
148int __meminit vmemmap_populate_basepages(struct page *start_page, 150int __meminit vmemmap_populate_basepages(unsigned long start,
149 unsigned long size, int node) 151 unsigned long end, int node)
150{ 152{
151 unsigned long addr = (unsigned long)start_page; 153 unsigned long addr = start;
152 unsigned long end = (unsigned long)(start_page + size);
153 pgd_t *pgd; 154 pgd_t *pgd;
154 pud_t *pud; 155 pud_t *pud;
155 pmd_t *pmd; 156 pmd_t *pmd;
@@ -176,9 +177,15 @@ int __meminit vmemmap_populate_basepages(struct page *start_page,
176 177
177struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) 178struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
178{ 179{
179 struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); 180 unsigned long start;
180 int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); 181 unsigned long end;
181 if (error) 182 struct page *map;
183
184 map = pfn_to_page(pnum * PAGES_PER_SECTION);
185 start = (unsigned long)map;
186 end = (unsigned long)(map + PAGES_PER_SECTION);
187
188 if (vmemmap_populate(start, end, nid))
182 return NULL; 189 return NULL;
183 190
184 return map; 191 return map;
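
With the interface switched from (start_page, nr_pages) to a virtual address range, an architecture that maps its vmemmap with base pages can implement vmemmap_populate() as a thin wrapper. A minimal sketch of that arrangement, assuming no huge-page optimisation:

int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
        /* populate pgd/pud/pmd/pte levels page by page for [start, end) */
        return vmemmap_populate_basepages(start, end, node);
}
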
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc847947..1c91f0d3f6ab 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,12 +615,20 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
615} 615}
616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 vmemmap_free(memmap, nr_pages); 618 unsigned long start = (unsigned long)memmap;
619 unsigned long end = (unsigned long)(memmap + nr_pages);
620
621 vmemmap_free(start, end);
619} 622}
623#ifdef CONFIG_MEMORY_HOTREMOVE
620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 624static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 625{
622 vmemmap_free(memmap, nr_pages); 626 unsigned long start = (unsigned long)memmap;
627 unsigned long end = (unsigned long)(memmap + nr_pages);
628
629 vmemmap_free(start, end);
623} 630}
631#endif /* CONFIG_MEMORY_HOTREMOVE */
624#else 632#else
625static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 633static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
626{ 634{
@@ -658,6 +666,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
658 get_order(sizeof(struct page) * nr_pages)); 666 get_order(sizeof(struct page) * nr_pages));
659} 667}
660 668
669#ifdef CONFIG_MEMORY_HOTREMOVE
661static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 670static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
662{ 671{
663 unsigned long maps_section_nr, removing_section_nr, i; 672 unsigned long maps_section_nr, removing_section_nr, i;
@@ -684,40 +693,9 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
684 put_page_bootmem(page); 693 put_page_bootmem(page);
685 } 694 }
686} 695}
696#endif /* CONFIG_MEMORY_HOTREMOVE */
687#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 697#endif /* CONFIG_SPARSEMEM_VMEMMAP */
688 698
689static void free_section_usemap(struct page *memmap, unsigned long *usemap)
690{
691 struct page *usemap_page;
692 unsigned long nr_pages;
693
694 if (!usemap)
695 return;
696
697 usemap_page = virt_to_page(usemap);
698 /*
699 * Check to see if allocation came from hot-plug-add
700 */
701 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
702 kfree(usemap);
703 if (memmap)
704 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
705 return;
706 }
707
708 /*
709 * The usemap came from bootmem. This is packed with other usemaps
710 * on the section which has pgdat at boot time. Just keep it as is now.
711 */
712
713 if (memmap) {
714 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
715 >> PAGE_SHIFT;
716
717 free_map_bootmem(memmap, nr_pages);
718 }
719}
720
721/* 699/*
722 * returns the number of sections whose mem_maps were properly 700 * returns the number of sections whose mem_maps were properly
723 * set. If this is <=0, then that means that the passed-in 701 * set. If this is <=0, then that means that the passed-in
@@ -794,6 +772,39 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
794} 772}
795#endif 773#endif
796 774
775#ifdef CONFIG_MEMORY_HOTREMOVE
776static void free_section_usemap(struct page *memmap, unsigned long *usemap)
777{
778 struct page *usemap_page;
779 unsigned long nr_pages;
780
781 if (!usemap)
782 return;
783
784 usemap_page = virt_to_page(usemap);
785 /*
786 * Check to see if allocation came from hot-plug-add
787 */
788 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
789 kfree(usemap);
790 if (memmap)
791 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
792 return;
793 }
794
795 /*
796 * The usemap came from bootmem. This is packed with other usemaps
797 * on the section which has pgdat at boot time. Just keep it as is now.
798 */
799
800 if (memmap) {
801 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
802 >> PAGE_SHIFT;
803
804 free_map_bootmem(memmap, nr_pages);
805 }
806}
807
797void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 808void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
798{ 809{
799 struct page *memmap = NULL; 810 struct page *memmap = NULL;
@@ -813,4 +824,5 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
813 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 824 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
814 free_section_usemap(memmap, usemap); 825 free_section_usemap(memmap, usemap);
815} 826}
816#endif 827#endif /* CONFIG_MEMORY_HOTREMOVE */
828#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 8a529a01e8fc..acd40bfffa82 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -737,7 +737,7 @@ EXPORT_SYMBOL(__pagevec_release);
737#ifdef CONFIG_TRANSPARENT_HUGEPAGE 737#ifdef CONFIG_TRANSPARENT_HUGEPAGE
738/* used by __split_huge_page_refcount() */ 738/* used by __split_huge_page_refcount() */
739void lru_add_page_tail(struct page *page, struct page *page_tail, 739void lru_add_page_tail(struct page *page, struct page *page_tail,
740 struct lruvec *lruvec) 740 struct lruvec *lruvec, struct list_head *list)
741{ 741{
742 int uninitialized_var(active); 742 int uninitialized_var(active);
743 enum lru_list lru; 743 enum lru_list lru;
@@ -749,7 +749,8 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
749 VM_BUG_ON(NR_CPUS != 1 && 749 VM_BUG_ON(NR_CPUS != 1 &&
750 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); 750 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
751 751
752 SetPageLRU(page_tail); 752 if (!list)
753 SetPageLRU(page_tail);
753 754
754 if (page_evictable(page_tail)) { 755 if (page_evictable(page_tail)) {
755 if (PageActive(page)) { 756 if (PageActive(page)) {
@@ -767,7 +768,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
767 768
768 if (likely(PageLRU(page))) 769 if (likely(PageLRU(page)))
769 list_add_tail(&page_tail->lru, &page->lru); 770 list_add_tail(&page_tail->lru, &page->lru);
770 else { 771 else if (list) {
772 /* page reclaim is reclaiming a huge page */
773 get_page(page_tail);
774 list_add_tail(&page_tail->lru, list);
775 } else {
771 struct list_head *list_head; 776 struct list_head *list_head;
772 /* 777 /*
773 * Head page has not yet been counted, as an hpage, 778 * Head page has not yet been counted, as an hpage,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7efcf1525921..b3d40dcf3624 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -78,7 +78,7 @@ void show_swap_cache_info(void)
78 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 78 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
79 * but sets SwapCache flag and private instead of mapping and index. 79 * but sets SwapCache flag and private instead of mapping and index.
80 */ 80 */
81static int __add_to_swap_cache(struct page *page, swp_entry_t entry) 81int __add_to_swap_cache(struct page *page, swp_entry_t entry)
82{ 82{
83 int error; 83 int error;
84 struct address_space *address_space; 84 struct address_space *address_space;
@@ -160,7 +160,7 @@ void __delete_from_swap_cache(struct page *page)
160 * Allocate swap space for the page and add the page to the 160 * Allocate swap space for the page and add the page to the
161 * swap cache. Caller needs to hold the page lock. 161 * swap cache. Caller needs to hold the page lock.
162 */ 162 */
163int add_to_swap(struct page *page) 163int add_to_swap(struct page *page, struct list_head *list)
164{ 164{
165 swp_entry_t entry; 165 swp_entry_t entry;
166 int err; 166 int err;
@@ -173,7 +173,7 @@ int add_to_swap(struct page *page)
173 return 0; 173 return 0;
174 174
175 if (unlikely(PageTransHuge(page))) 175 if (unlikely(PageTransHuge(page)))
176 if (unlikely(split_huge_page(page))) { 176 if (unlikely(split_huge_page_to_list(page, list))) {
177 swapcache_free(entry, NULL); 177 swapcache_free(entry, NULL);
178 return 0; 178 return 0;
179 } 179 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a1f7772a01fc..d417efddfe74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2120,7 +2120,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2120 if (p->bdev) { 2120 if (p->bdev) {
2121 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2121 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2122 p->flags |= SWP_SOLIDSTATE; 2122 p->flags |= SWP_SOLIDSTATE;
2123 p->cluster_next = 1 + (random32() % p->highest_bit); 2123 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2124 } 2124 }
2125 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) 2125 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2126 p->flags |= SWP_DISCARDABLE; 2126 p->flags |= SWP_DISCARDABLE;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f751f2068c3..72043d6c88c0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -249,19 +249,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
249#define VM_LAZY_FREEING 0x02 249#define VM_LAZY_FREEING 0x02
250#define VM_VM_AREA 0x04 250#define VM_VM_AREA 0x04
251 251
252struct vmap_area {
253 unsigned long va_start;
254 unsigned long va_end;
255 unsigned long flags;
256 struct rb_node rb_node; /* address sorted rbtree */
257 struct list_head list; /* address sorted list */
258 struct list_head purge_list; /* "lazy purge" list */
259 struct vm_struct *vm;
260 struct rcu_head rcu_head;
261};
262
263static DEFINE_SPINLOCK(vmap_area_lock); 252static DEFINE_SPINLOCK(vmap_area_lock);
264static LIST_HEAD(vmap_area_list); 253/* Export for kexec only */
254LIST_HEAD(vmap_area_list);
265static struct rb_root vmap_area_root = RB_ROOT; 255static struct rb_root vmap_area_root = RB_ROOT;
266 256
267/* The vmap cache globals are protected by vmap_area_lock */ 257/* The vmap cache globals are protected by vmap_area_lock */
@@ -313,7 +303,7 @@ static void __insert_vmap_area(struct vmap_area *va)
313 rb_link_node(&va->rb_node, parent, p); 303 rb_link_node(&va->rb_node, parent, p);
314 rb_insert_color(&va->rb_node, &vmap_area_root); 304 rb_insert_color(&va->rb_node, &vmap_area_root);
315 305
316 /* address-sort this list so it is usable like the vmlist */ 306 /* address-sort this list */
317 tmp = rb_prev(&va->rb_node); 307 tmp = rb_prev(&va->rb_node);
318 if (tmp) { 308 if (tmp) {
319 struct vmap_area *prev; 309 struct vmap_area *prev;
@@ -1125,6 +1115,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
1125} 1115}
1126EXPORT_SYMBOL(vm_map_ram); 1116EXPORT_SYMBOL(vm_map_ram);
1127 1117
1118static struct vm_struct *vmlist __initdata;
1128/** 1119/**
1129 * vm_area_add_early - add vmap area early during boot 1120 * vm_area_add_early - add vmap area early during boot
1130 * @vm: vm_struct to add 1121 * @vm: vm_struct to add
@@ -1283,41 +1274,35 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1283} 1274}
1284EXPORT_SYMBOL_GPL(map_vm_area); 1275EXPORT_SYMBOL_GPL(map_vm_area);
1285 1276
1286/*** Old vmalloc interfaces ***/
1287DEFINE_RWLOCK(vmlist_lock);
1288struct vm_struct *vmlist;
1289
1290static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1277static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1291 unsigned long flags, const void *caller) 1278 unsigned long flags, const void *caller)
1292{ 1279{
1280 spin_lock(&vmap_area_lock);
1293 vm->flags = flags; 1281 vm->flags = flags;
1294 vm->addr = (void *)va->va_start; 1282 vm->addr = (void *)va->va_start;
1295 vm->size = va->va_end - va->va_start; 1283 vm->size = va->va_end - va->va_start;
1296 vm->caller = caller; 1284 vm->caller = caller;
1297 va->vm = vm; 1285 va->vm = vm;
1298 va->flags |= VM_VM_AREA; 1286 va->flags |= VM_VM_AREA;
1287 spin_unlock(&vmap_area_lock);
1299} 1288}
1300 1289
1301static void insert_vmalloc_vmlist(struct vm_struct *vm) 1290static void clear_vm_unlist(struct vm_struct *vm)
1302{ 1291{
1303 struct vm_struct *tmp, **p; 1292 /*
1304 1293 * Before removing VM_UNLIST,
1294 * we should make sure that vm has proper values.
1295 * Pair with smp_rmb() in show_numa_info().
1296 */
1297 smp_wmb();
1305 vm->flags &= ~VM_UNLIST; 1298 vm->flags &= ~VM_UNLIST;
1306 write_lock(&vmlist_lock);
1307 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1308 if (tmp->addr >= vm->addr)
1309 break;
1310 }
1311 vm->next = *p;
1312 *p = vm;
1313 write_unlock(&vmlist_lock);
1314} 1299}
1315 1300
1316static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1301static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1317 unsigned long flags, const void *caller) 1302 unsigned long flags, const void *caller)
1318{ 1303{
1319 setup_vmalloc_vm(vm, va, flags, caller); 1304 setup_vmalloc_vm(vm, va, flags, caller);
1320 insert_vmalloc_vmlist(vm); 1305 clear_vm_unlist(vm);
1321} 1306}
1322 1307
1323static struct vm_struct *__get_vm_area_node(unsigned long size, 1308static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1360,10 +1345,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1360 1345
1361 /* 1346 /*
1362 * When this function is called from __vmalloc_node_range, 1347 * When this function is called from __vmalloc_node_range,
1363 * we do not add vm_struct to vmlist here to avoid 1348 * we add VM_UNLIST flag to avoid accessing uninitialized
1364 * accessing uninitialized members of vm_struct such as 1349 * members of vm_struct such as pages and nr_pages fields.
1365 * pages and nr_pages fields. They will be set later. 1350 * They will be set later.
1366 * To distinguish it from others, we use a VM_UNLIST flag.
1367 */ 1351 */
1368 if (flags & VM_UNLIST) 1352 if (flags & VM_UNLIST)
1369 setup_vmalloc_vm(area, va, flags, caller); 1353 setup_vmalloc_vm(area, va, flags, caller);
@@ -1447,19 +1431,10 @@ struct vm_struct *remove_vm_area(const void *addr)
1447 if (va && va->flags & VM_VM_AREA) { 1431 if (va && va->flags & VM_VM_AREA) {
1448 struct vm_struct *vm = va->vm; 1432 struct vm_struct *vm = va->vm;
1449 1433
1450 if (!(vm->flags & VM_UNLIST)) { 1434 spin_lock(&vmap_area_lock);
1451 struct vm_struct *tmp, **p; 1435 va->vm = NULL;
1452 /* 1436 va->flags &= ~VM_VM_AREA;
1453 * remove from list and disallow access to 1437 spin_unlock(&vmap_area_lock);
1454 * this vm_struct before unmap. (address range
1455 * confliction is maintained by vmap.)
1456 */
1457 write_lock(&vmlist_lock);
1458 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1459 ;
1460 *p = tmp->next;
1461 write_unlock(&vmlist_lock);
1462 }
1463 1438
1464 vmap_debug_free_range(va->va_start, va->va_end); 1439 vmap_debug_free_range(va->va_start, va->va_end);
1465 free_unmap_vmap_area(va); 1440 free_unmap_vmap_area(va);
@@ -1680,10 +1655,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1680 return NULL; 1655 return NULL;
1681 1656
1682 /* 1657 /*
1683 * In this function, newly allocated vm_struct is not added 1658 * In this function, newly allocated vm_struct has VM_UNLIST flag.
1684 * to vmlist at __get_vm_area_node(). so, it is added here. 1659 * It means that vm_struct is not fully initialized.
1660 * Now, it is fully initialized, so remove this flag here.
1685 */ 1661 */
1686 insert_vmalloc_vmlist(area); 1662 clear_vm_unlist(area);
1687 1663
1688 /* 1664 /*
1689 * A ref_count = 3 is needed because the vm_struct and vmap_area 1665 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2005,7 +1981,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2005 1981
2006long vread(char *buf, char *addr, unsigned long count) 1982long vread(char *buf, char *addr, unsigned long count)
2007{ 1983{
2008 struct vm_struct *tmp; 1984 struct vmap_area *va;
1985 struct vm_struct *vm;
2009 char *vaddr, *buf_start = buf; 1986 char *vaddr, *buf_start = buf;
2010 unsigned long buflen = count; 1987 unsigned long buflen = count;
2011 unsigned long n; 1988 unsigned long n;
@@ -2014,10 +1991,17 @@ long vread(char *buf, char *addr, unsigned long count)
2014 if ((unsigned long) addr + count < count) 1991 if ((unsigned long) addr + count < count)
2015 count = -(unsigned long) addr; 1992 count = -(unsigned long) addr;
2016 1993
2017 read_lock(&vmlist_lock); 1994 spin_lock(&vmap_area_lock);
2018 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 1995 list_for_each_entry(va, &vmap_area_list, list) {
2019 vaddr = (char *) tmp->addr; 1996 if (!count)
2020 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1997 break;
1998
1999 if (!(va->flags & VM_VM_AREA))
2000 continue;
2001
2002 vm = va->vm;
2003 vaddr = (char *) vm->addr;
2004 if (addr >= vaddr + vm->size - PAGE_SIZE)
2021 continue; 2005 continue;
2022 while (addr < vaddr) { 2006 while (addr < vaddr) {
2023 if (count == 0) 2007 if (count == 0)
@@ -2027,10 +2011,10 @@ long vread(char *buf, char *addr, unsigned long count)
2027 addr++; 2011 addr++;
2028 count--; 2012 count--;
2029 } 2013 }
2030 n = vaddr + tmp->size - PAGE_SIZE - addr; 2014 n = vaddr + vm->size - PAGE_SIZE - addr;
2031 if (n > count) 2015 if (n > count)
2032 n = count; 2016 n = count;
2033 if (!(tmp->flags & VM_IOREMAP)) 2017 if (!(vm->flags & VM_IOREMAP))
2034 aligned_vread(buf, addr, n); 2018 aligned_vread(buf, addr, n);
2035 else /* IOREMAP area is treated as memory hole */ 2019 else /* IOREMAP area is treated as memory hole */
2036 memset(buf, 0, n); 2020 memset(buf, 0, n);
@@ -2039,7 +2023,7 @@ long vread(char *buf, char *addr, unsigned long count)
2039 count -= n; 2023 count -= n;
2040 } 2024 }
2041finished: 2025finished:
2042 read_unlock(&vmlist_lock); 2026 spin_unlock(&vmap_area_lock);
2043 2027
2044 if (buf == buf_start) 2028 if (buf == buf_start)
2045 return 0; 2029 return 0;
@@ -2078,7 +2062,8 @@ finished:
2078 2062
2079long vwrite(char *buf, char *addr, unsigned long count) 2063long vwrite(char *buf, char *addr, unsigned long count)
2080{ 2064{
2081 struct vm_struct *tmp; 2065 struct vmap_area *va;
2066 struct vm_struct *vm;
2082 char *vaddr; 2067 char *vaddr;
2083 unsigned long n, buflen; 2068 unsigned long n, buflen;
2084 int copied = 0; 2069 int copied = 0;
@@ -2088,10 +2073,17 @@ long vwrite(char *buf, char *addr, unsigned long count)
2088 count = -(unsigned long) addr; 2073 count = -(unsigned long) addr;
2089 buflen = count; 2074 buflen = count;
2090 2075
2091 read_lock(&vmlist_lock); 2076 spin_lock(&vmap_area_lock);
2092 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 2077 list_for_each_entry(va, &vmap_area_list, list) {
2093 vaddr = (char *) tmp->addr; 2078 if (!count)
2094 if (addr >= vaddr + tmp->size - PAGE_SIZE) 2079 break;
2080
2081 if (!(va->flags & VM_VM_AREA))
2082 continue;
2083
2084 vm = va->vm;
2085 vaddr = (char *) vm->addr;
2086 if (addr >= vaddr + vm->size - PAGE_SIZE)
2095 continue; 2087 continue;
2096 while (addr < vaddr) { 2088 while (addr < vaddr) {
2097 if (count == 0) 2089 if (count == 0)
@@ -2100,10 +2092,10 @@ long vwrite(char *buf, char *addr, unsigned long count)
2100 addr++; 2092 addr++;
2101 count--; 2093 count--;
2102 } 2094 }
2103 n = vaddr + tmp->size - PAGE_SIZE - addr; 2095 n = vaddr + vm->size - PAGE_SIZE - addr;
2104 if (n > count) 2096 if (n > count)
2105 n = count; 2097 n = count;
2106 if (!(tmp->flags & VM_IOREMAP)) { 2098 if (!(vm->flags & VM_IOREMAP)) {
2107 aligned_vwrite(buf, addr, n); 2099 aligned_vwrite(buf, addr, n);
2108 copied++; 2100 copied++;
2109 } 2101 }
@@ -2112,7 +2104,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2112 count -= n; 2104 count -= n;
2113 } 2105 }
2114finished: 2106finished:
2115 read_unlock(&vmlist_lock); 2107 spin_unlock(&vmap_area_lock);
2116 if (!copied) 2108 if (!copied)
2117 return 0; 2109 return 0;
2118 return buflen; 2110 return buflen;
@@ -2519,19 +2511,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2519 2511
2520#ifdef CONFIG_PROC_FS 2512#ifdef CONFIG_PROC_FS
2521static void *s_start(struct seq_file *m, loff_t *pos) 2513static void *s_start(struct seq_file *m, loff_t *pos)
2522 __acquires(&vmlist_lock) 2514 __acquires(&vmap_area_lock)
2523{ 2515{
2524 loff_t n = *pos; 2516 loff_t n = *pos;
2525 struct vm_struct *v; 2517 struct vmap_area *va;
2526 2518
2527 read_lock(&vmlist_lock); 2519 spin_lock(&vmap_area_lock);
2528 v = vmlist; 2520 va = list_entry((&vmap_area_list)->next, typeof(*va), list);
2529 while (n > 0 && v) { 2521 while (n > 0 && &va->list != &vmap_area_list) {
2530 n--; 2522 n--;
2531 v = v->next; 2523 va = list_entry(va->list.next, typeof(*va), list);
2532 } 2524 }
2533 if (!n) 2525 if (!n && &va->list != &vmap_area_list)
2534 return v; 2526 return va;
2535 2527
2536 return NULL; 2528 return NULL;
2537 2529
@@ -2539,16 +2531,20 @@ static void *s_start(struct seq_file *m, loff_t *pos)
2539 2531
2540static void *s_next(struct seq_file *m, void *p, loff_t *pos) 2532static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2541{ 2533{
2542 struct vm_struct *v = p; 2534 struct vmap_area *va = p, *next;
2543 2535
2544 ++*pos; 2536 ++*pos;
2545 return v->next; 2537 next = list_entry(va->list.next, typeof(*va), list);
2538 if (&next->list != &vmap_area_list)
2539 return next;
2540
2541 return NULL;
2546} 2542}
2547 2543
2548static void s_stop(struct seq_file *m, void *p) 2544static void s_stop(struct seq_file *m, void *p)
2549 __releases(&vmlist_lock) 2545 __releases(&vmap_area_lock)
2550{ 2546{
2551 read_unlock(&vmlist_lock); 2547 spin_unlock(&vmap_area_lock);
2552} 2548}
2553 2549
2554static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2550static void show_numa_info(struct seq_file *m, struct vm_struct *v)
@@ -2559,6 +2555,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2559 if (!counters) 2555 if (!counters)
2560 return; 2556 return;
2561 2557
2558 /* Pair with smp_wmb() in clear_vm_unlist() */
2559 smp_rmb();
2560 if (v->flags & VM_UNLIST)
2561 return;
2562
2562 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2563 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2563 2564
2564 for (nr = 0; nr < v->nr_pages; nr++) 2565 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2572,7 +2573,20 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2572 2573
2573static int s_show(struct seq_file *m, void *p) 2574static int s_show(struct seq_file *m, void *p)
2574{ 2575{
2575 struct vm_struct *v = p; 2576 struct vmap_area *va = p;
2577 struct vm_struct *v;
2578
2579 if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
2580 return 0;
2581
2582 if (!(va->flags & VM_VM_AREA)) {
2583 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
2584 (void *)va->va_start, (void *)va->va_end,
2585 va->va_end - va->va_start);
2586 return 0;
2587 }
2588
2589 v = va->vm;
2576 2590
2577 seq_printf(m, "0x%pK-0x%pK %7ld", 2591 seq_printf(m, "0x%pK-0x%pK %7ld",
2578 v->addr, v->addr + v->size, v->size); 2592 v->addr, v->addr + v->size, v->size);
@@ -2645,5 +2659,53 @@ static int __init proc_vmalloc_init(void)
2645 return 0; 2659 return 0;
2646} 2660}
2647module_init(proc_vmalloc_init); 2661module_init(proc_vmalloc_init);
2662
2663void get_vmalloc_info(struct vmalloc_info *vmi)
2664{
2665 struct vmap_area *va;
2666 unsigned long free_area_size;
2667 unsigned long prev_end;
2668
2669 vmi->used = 0;
2670 vmi->largest_chunk = 0;
2671
2672 prev_end = VMALLOC_START;
2673
2674 spin_lock(&vmap_area_lock);
2675
2676 if (list_empty(&vmap_area_list)) {
2677 vmi->largest_chunk = VMALLOC_TOTAL;
2678 goto out;
2679 }
2680
2681 list_for_each_entry(va, &vmap_area_list, list) {
2682 unsigned long addr = va->va_start;
2683
2684 /*
2685 * Some archs keep another range for modules in vmalloc space
2686 */
2687 if (addr < VMALLOC_START)
2688 continue;
2689 if (addr >= VMALLOC_END)
2690 break;
2691
2692 if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
2693 continue;
2694
2695 vmi->used += (va->va_end - va->va_start);
2696
2697 free_area_size = addr - prev_end;
2698 if (vmi->largest_chunk < free_area_size)
2699 vmi->largest_chunk = free_area_size;
2700
2701 prev_end = va->va_end;
2702 }
2703
2704 if (VMALLOC_END - prev_end > vmi->largest_chunk)
2705 vmi->largest_chunk = VMALLOC_END - prev_end;
2706
2707out:
2708 spin_unlock(&vmap_area_lock);
2709}
2648#endif 2710#endif
2649 2711
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000000..736a6011c2c8
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
1/*
2 * Linux VM pressure
3 *
4 * Copyright 2012 Linaro Ltd.
5 * Anton Vorontsov <anton.vorontsov@linaro.org>
6 *
7 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
8 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License version 2 as published
12 * by the Free Software Foundation.
13 */
14
15#include <linux/cgroup.h>
16#include <linux/fs.h>
17#include <linux/log2.h>
18#include <linux/sched.h>
19#include <linux/mm.h>
20#include <linux/vmstat.h>
21#include <linux/eventfd.h>
22#include <linux/swap.h>
23#include <linux/printk.h>
24#include <linux/vmpressure.h>
25
26/*
27 * The window size (vmpressure_win) is the number of scanned pages before
 28 * we try to analyze the scanned/reclaimed ratio. So the window is used
 29 * as a rate-limit tunable for the "low" level notification, and also for
 30 * averaging the ratio for medium/critical levels. Using a small window
 31 * size can cause a lot of false positives, but too big a window size
 32 * will delay the notifications.
33 *
34 * As the vmscan reclaimer logic works with chunks which are multiple of
35 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
36 *
37 * TODO: Make the window size depend on machine size, as we do for vmstat
38 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
39 */
40static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
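A quick sanity check of the default, assuming the usual SWAP_CLUSTER_MAX of 32 pages: 32 * 16 = 512 pages per window, and with 4KB pages that is 512 * 4KB = 2MB, matching the figure in the TODO above.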
41
42/*
43 * These thresholds are used when we account memory pressure through
 44 * the scanned/reclaimed ratio. The current values were chosen empirically.
 45 * In essence, they are percents: the higher the value, the more
 46 * unsuccessful reclaims there were.
47 */
48static const unsigned int vmpressure_level_med = 60;
49static const unsigned int vmpressure_level_critical = 95;
50
51/*
 52 * When there are too few pages left to scan, vmpressure() may miss the
 53 * critical pressure as the number of pages will be less than "window size".
 54 * However, in that case the vmscan priority will rise fast as the
55 * reclaimer will try to scan LRUs more deeply.
56 *
57 * The vmscan logic considers these special priorities:
58 *
59 * prio == DEF_PRIORITY (12): reclaimer starts with that value
60 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
61 * prio == 0 : close to OOM, kernel scans every page in an lru
62 *
63 * Any value in this range is acceptable for this tunable (i.e. from 12 to
 64 * 0). The current value of vmpressure_level_critical_prio was chosen
 65 * empirically, but the number, in essence, means that we consider the
 66 * critical level when scanning depth is ~10% of the lru size (vmscan
 67 * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 68 * eighth).
69 */
70static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
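Working through the constant: 100 / 10 = 10 and ilog2(10) = 3, so only prio <= 3 is treated as critical. At prio == 3 vmscan scans lru_size >> 3 pages per pass, i.e. one eighth (12.5%) of the LRU, which is the "roughly 10%" scanning depth the comment above refers to.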
71
72static struct vmpressure *work_to_vmpressure(struct work_struct *work)
73{
74 return container_of(work, struct vmpressure, work);
75}
76
77static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
78{
79 return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
80}
81
82static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
83{
84 struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
85 struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
86
87 memcg = parent_mem_cgroup(memcg);
88 if (!memcg)
89 return NULL;
90 return memcg_to_vmpressure(memcg);
91}
92
93enum vmpressure_levels {
94 VMPRESSURE_LOW = 0,
95 VMPRESSURE_MEDIUM,
96 VMPRESSURE_CRITICAL,
97 VMPRESSURE_NUM_LEVELS,
98};
99
100static const char * const vmpressure_str_levels[] = {
101 [VMPRESSURE_LOW] = "low",
102 [VMPRESSURE_MEDIUM] = "medium",
103 [VMPRESSURE_CRITICAL] = "critical",
104};
105
106static enum vmpressure_levels vmpressure_level(unsigned long pressure)
107{
108 if (pressure >= vmpressure_level_critical)
109 return VMPRESSURE_CRITICAL;
110 else if (pressure >= vmpressure_level_med)
111 return VMPRESSURE_MEDIUM;
112 return VMPRESSURE_LOW;
113}
114
115static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
116 unsigned long reclaimed)
117{
118 unsigned long scale = scanned + reclaimed;
119 unsigned long pressure;
120
121 /*
122 * We calculate the ratio (in percents) of how many pages were
123 * scanned vs. reclaimed in a given time frame (window). Note that
124 * time is in the VM reclaimer's "ticks", i.e. the number of pages
125 * scanned. This makes it possible to set the desired reaction time
126 * and serves as a ratelimit.
127 */
128 pressure = scale - (reclaimed * scale / scanned);
129 pressure = pressure * 100 / scale;
130
131 pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
132 scanned, reclaimed);
133
134 return vmpressure_level(pressure);
135}
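A worked example with made-up numbers: for scanned = 512 and reclaimed = 128, scale = 640, so pressure = 640 - (128 * 640 / 512) = 640 - 160 = 480, and 480 * 100 / 640 = 75. That is above vmpressure_level_med (60) but below vmpressure_level_critical (95), so the window is reported as "medium".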
136
137struct vmpressure_event {
138 struct eventfd_ctx *efd;
139 enum vmpressure_levels level;
140 struct list_head node;
141};
142
143static bool vmpressure_event(struct vmpressure *vmpr,
144 unsigned long scanned, unsigned long reclaimed)
145{
146 struct vmpressure_event *ev;
147 enum vmpressure_levels level;
148 bool signalled = false;
149
150 level = vmpressure_calc_level(scanned, reclaimed);
151
152 mutex_lock(&vmpr->events_lock);
153
154 list_for_each_entry(ev, &vmpr->events, node) {
155 if (level >= ev->level) {
156 eventfd_signal(ev->efd, 1);
157 signalled = true;
158 }
159 }
160
161 mutex_unlock(&vmpr->events_lock);
162
163 return signalled;
164}
165
166static void vmpressure_work_fn(struct work_struct *work)
167{
168 struct vmpressure *vmpr = work_to_vmpressure(work);
169 unsigned long scanned;
170 unsigned long reclaimed;
171
172 /*
173 * Several contexts might be calling vmpressure(), so it is
174 * possible that the work was rescheduled again before the old
175 * work context cleared the counters. In that case we will run
176 * just after the old work returns, but then scanned might be zero
177 * here. No need for any locks here since we don't care if
178 * vmpr->reclaimed is in sync.
179 */
180 if (!vmpr->scanned)
181 return;
182
183 mutex_lock(&vmpr->sr_lock);
184 scanned = vmpr->scanned;
185 reclaimed = vmpr->reclaimed;
186 vmpr->scanned = 0;
187 vmpr->reclaimed = 0;
188 mutex_unlock(&vmpr->sr_lock);
189
190 do {
191 if (vmpressure_event(vmpr, scanned, reclaimed))
192 break;
193 /*
194 * If not handled, propagate the event upward into the
195 * hierarchy.
196 */
197 } while ((vmpr = vmpressure_parent(vmpr)));
198}
199
200/**
201 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
202 * @gfp: reclaimer's gfp mask
203 * @memcg: cgroup memory controller handle
204 * @scanned: number of pages scanned
205 * @reclaimed: number of pages reclaimed
206 *
207 * This function should be called from the vmscan reclaim path to account
208 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
209 * pressure index is then further refined and averaged over time.
210 *
211 * This function does not return any value.
212 */
213void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
214 unsigned long scanned, unsigned long reclaimed)
215{
216 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
217
218 /*
219 * Here we only want to account pressure that userland is able to
220 * help us with. For example, suppose that DMA zone is under
221 * pressure; if we notify userland about that kind of pressure,
222 * then it will be mostly a waste as it will trigger unnecessary
223 * freeing of memory by userland (since userland is more likely to
224 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
225 * is why we include only movable, highmem and FS/IO pages.
226 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
227 * we account it too.
228 */
229 if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
230 return;
231
232 /*
233 * If we got here with no pages scanned, then that is an indicator
234 * that reclaimer was unable to find any shrinkable LRUs at the
235 * current scanning depth. But it does not mean that we should
236 * report the critical pressure, yet. If the scanning priority
237 * (scanning depth) goes too high (deep), we will be notified
238 * through vmpressure_prio(). But so far, keep calm.
239 */
240 if (!scanned)
241 return;
242
243 mutex_lock(&vmpr->sr_lock);
244 vmpr->scanned += scanned;
245 vmpr->reclaimed += reclaimed;
246 scanned = vmpr->scanned;
247 mutex_unlock(&vmpr->sr_lock);
248
249 if (scanned < vmpressure_win || work_pending(&vmpr->work))
250 return;
251 schedule_work(&vmpr->work);
252}
253
254/**
255 * vmpressure_prio() - Account memory pressure through reclaimer priority level
256 * @gfp: reclaimer's gfp mask
257 * @memcg: cgroup memory controller handle
258 * @prio: reclaimer's priority
259 *
260 * This function should be called from the reclaim path every time
261 * the vmscan reclaim priority (scanning depth) changes.
262 *
263 * This function does not return any value.
264 */
265void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
266{
267 /*
268 * We only use prio for accounting critical level. For more info
269 * see comment for vmpressure_level_critical_prio variable above.
270 */
271 if (prio > vmpressure_level_critical_prio)
272 return;
273
274 /*
275 * OK, the prio is below the threshold; update the vmpressure
276 * information before the shrinker dives into a long, deep
277 * vmscan pass. Passing scanned = vmpressure_win, reclaimed = 0
278 * to vmpressure() basically means that we signal the 'critical'
279 * level.
280 */
281 vmpressure(gfp, memcg, vmpressure_win, 0);
282}
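Roughly speaking, if nothing else has accumulated in the current window, vmpressure_calc_level() then sees reclaimed = 0, so pressure = scale - 0 = scale and scale * 100 / scale = 100, which is above vmpressure_level_critical (95) regardless of the window size; hence the 'critical' signal the comment mentions.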
283
284/**
285 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
286 * @cg: cgroup that is interested in vmpressure notifications
287 * @cft: cgroup control files handle
288 * @eventfd: eventfd context to link notifications with
289 * @args: event arguments (used to set up a pressure level threshold)
290 *
291 * This function associates an eventfd context with the vmpressure
292 * infrastructure, so that the notifications will be delivered to the
293 * @eventfd. The @args parameter is a string that denotes the pressure
294 * level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
295 * or "critical").
296 *
297 * This function should not be used directly, just pass it to (struct
298 * cftype).register_event, and then cgroup core will handle everything by
299 * itself.
300 */
301int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
302 struct eventfd_ctx *eventfd, const char *args)
303{
304 struct vmpressure *vmpr = cg_to_vmpressure(cg);
305 struct vmpressure_event *ev;
306 int level;
307
308 for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
309 if (!strcmp(vmpressure_str_levels[level], args))
310 break;
311 }
312
313 if (level >= VMPRESSURE_NUM_LEVELS)
314 return -EINVAL;
315
316 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
317 if (!ev)
318 return -ENOMEM;
319
320 ev->efd = eventfd;
321 ev->level = level;
322
323 mutex_lock(&vmpr->events_lock);
324 list_add(&ev->node, &vmpr->events);
325 mutex_unlock(&vmpr->events_lock);
326
327 return 0;
328}
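To make the registration flow above concrete, here is a minimal userspace sketch of a listener. It assumes the memcg side of this series exposes the level through a memory.pressure_level control file and uses the existing cgroup-v1 cgroup.event_control registration scheme; the file names and mount point are assumptions, not taken from the hunks in this patch.

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Paths are assumptions about the memcg mount and control file names. */
	int efd = eventfd(0, 0);
	int lfd = open("/sys/fs/cgroup/memory/memory.pressure_level", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/cgroup.event_control", O_WRONLY);
	char line[64];
	uint64_t count;

	if (efd < 0 || lfd < 0 || cfd < 0)
		return 1;

	/* "<eventfd fd> <pressure_level fd> <level>" registers the listener */
	snprintf(line, sizeof(line), "%d %d medium", efd, lfd);
	if (write(cfd, line, strlen(line)) < 0)
		return 1;

	/*
	 * vmpressure_event() signals every registered eventfd whose level is
	 * at or below the computed one, so this read() returns for "medium"
	 * and "critical" windows.
	 */
	while (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("memory pressure event, count=%llu\n",
		       (unsigned long long)count);

	return 0;
}

The eventfd_signal(ev->efd, 1) call in vmpressure_event() above is what completes each read() in this sketch.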
329
330/**
331 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
332 * @cg: cgroup handle
333 * @cft: cgroup control files handle
334 * @eventfd: eventfd context that was used to link vmpressure with the @cg
335 *
336 * This function does internal manipulations to detach the @eventfd from
337 * the vmpressure notifications, and then frees internal resources
338 * associated with the @eventfd (but the @eventfd itself is not freed).
339 *
340 * This function should not be used directly, just pass it to (struct
341 * cftype).unregister_event, and then cgroup core will handle everything
342 * by itself.
343 */
344void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
345 struct eventfd_ctx *eventfd)
346{
347 struct vmpressure *vmpr = cg_to_vmpressure(cg);
348 struct vmpressure_event *ev;
349
350 mutex_lock(&vmpr->events_lock);
351 list_for_each_entry(ev, &vmpr->events, node) {
352 if (ev->efd != eventfd)
353 continue;
354 list_del(&ev->node);
355 kfree(ev);
356 break;
357 }
358 mutex_unlock(&vmpr->events_lock);
359}
360
361/**
362 * vmpressure_init() - Initialize vmpressure control structure
363 * @vmpr: Structure to be initialized
364 *
365 * This function should be called on every allocated vmpressure structure
366 * before any usage.
367 */
368void vmpressure_init(struct vmpressure *vmpr)
369{
370 mutex_init(&vmpr->sr_lock);
371 mutex_init(&vmpr->events_lock);
372 INIT_LIST_HEAD(&vmpr->events);
373 INIT_WORK(&vmpr->work, vmpressure_work_fn);
374}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 88c5fed8b9a4..fa6a85378ee4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmpressure.h>
22#include <linux/vmstat.h> 23#include <linux/vmstat.h>
23#include <linux/file.h> 24#include <linux/file.h>
24#include <linux/writeback.h> 25#include <linux/writeback.h>
@@ -780,7 +781,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
780 if (PageAnon(page) && !PageSwapCache(page)) { 781 if (PageAnon(page) && !PageSwapCache(page)) {
781 if (!(sc->gfp_mask & __GFP_IO)) 782 if (!(sc->gfp_mask & __GFP_IO))
782 goto keep_locked; 783 goto keep_locked;
783 if (!add_to_swap(page)) 784 if (!add_to_swap(page, page_list))
784 goto activate_locked; 785 goto activate_locked;
785 may_enter_fs = 1; 786 may_enter_fs = 1;
786 } 787 }
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
1982 } 1983 }
1983 memcg = mem_cgroup_iter(root, memcg, &reclaim); 1984 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984 } while (memcg); 1985 } while (memcg);
1986
1987 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
1988 sc->nr_scanned - nr_scanned,
1989 sc->nr_reclaimed - nr_reclaimed);
1990
1985 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 1991 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986 sc->nr_scanned - nr_scanned, sc)); 1992 sc->nr_scanned - nr_scanned, sc));
1987} 1993}
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2167 count_vm_event(ALLOCSTALL); 2173 count_vm_event(ALLOCSTALL);
2168 2174
2169 do { 2175 do {
2176 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2177 sc->priority);
2170 sc->nr_scanned = 0; 2178 sc->nr_scanned = 0;
2171 aborted_reclaim = shrink_zones(zonelist, sc); 2179 aborted_reclaim = shrink_zones(zonelist, sc);
2172 2180
@@ -2619,7 +2627,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2619 bool pgdat_is_balanced = false; 2627 bool pgdat_is_balanced = false;
2620 int i; 2628 int i;
2621 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2629 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2622 unsigned long total_scanned;
2623 struct reclaim_state *reclaim_state = current->reclaim_state; 2630 struct reclaim_state *reclaim_state = current->reclaim_state;
2624 unsigned long nr_soft_reclaimed; 2631 unsigned long nr_soft_reclaimed;
2625 unsigned long nr_soft_scanned; 2632 unsigned long nr_soft_scanned;
@@ -2639,7 +2646,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2639 .gfp_mask = sc.gfp_mask, 2646 .gfp_mask = sc.gfp_mask,
2640 }; 2647 };
2641loop_again: 2648loop_again:
2642 total_scanned = 0;
2643 sc.priority = DEF_PRIORITY; 2649 sc.priority = DEF_PRIORITY;
2644 sc.nr_reclaimed = 0; 2650 sc.nr_reclaimed = 0;
2645 sc.may_writepage = !laptop_mode; 2651 sc.may_writepage = !laptop_mode;
@@ -2730,7 +2736,6 @@ loop_again:
2730 order, sc.gfp_mask, 2736 order, sc.gfp_mask,
2731 &nr_soft_scanned); 2737 &nr_soft_scanned);
2732 sc.nr_reclaimed += nr_soft_reclaimed; 2738 sc.nr_reclaimed += nr_soft_reclaimed;
2733 total_scanned += nr_soft_scanned;
2734 2739
2735 /* 2740 /*
2736 * We put equal pressure on every zone, unless 2741 * We put equal pressure on every zone, unless
@@ -2765,7 +2770,6 @@ loop_again:
2765 reclaim_state->reclaimed_slab = 0; 2770 reclaim_state->reclaimed_slab = 0;
2766 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2771 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2767 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2772 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2768 total_scanned += sc.nr_scanned;
2769 2773
2770 if (nr_slab == 0 && !zone_reclaimable(zone)) 2774 if (nr_slab == 0 && !zone_reclaimable(zone))
2771 zone->all_unreclaimable = 1; 2775 zone->all_unreclaimable = 1;
@@ -3188,9 +3192,9 @@ int kswapd_run(int nid)
3188 if (IS_ERR(pgdat->kswapd)) { 3192 if (IS_ERR(pgdat->kswapd)) {
3189 /* failure at boot is fatal */ 3193 /* failure at boot is fatal */
3190 BUG_ON(system_state == SYSTEM_BOOTING); 3194 BUG_ON(system_state == SYSTEM_BOOTING);
3191 pgdat->kswapd = NULL;
3192 pr_err("Failed to start kswapd on node %d\n", nid); 3195 pr_err("Failed to start kswapd on node %d\n", nid);
3193 ret = PTR_ERR(pgdat->kswapd); 3196 ret = PTR_ERR(pgdat->kswapd);
3197 pgdat->kswapd = NULL;
3194 } 3198 }
3195 return ret; 3199 return ret;
3196} 3200}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e1d8ed172c42..f42745e65780 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -52,7 +52,6 @@ void all_vm_events(unsigned long *ret)
52} 52}
53EXPORT_SYMBOL_GPL(all_vm_events); 53EXPORT_SYMBOL_GPL(all_vm_events);
54 54
55#ifdef CONFIG_HOTPLUG
56/* 55/*
57 * Fold the foreign cpu events into our own. 56 * Fold the foreign cpu events into our own.
58 * 57 *
@@ -69,7 +68,6 @@ void vm_events_fold_cpu(int cpu)
69 fold_state->event[i] = 0; 68 fold_state->event[i] = 0;
70 } 69 }
71} 70}
72#endif /* CONFIG_HOTPLUG */
73 71
74#endif /* CONFIG_VM_EVENT_COUNTERS */ 72#endif /* CONFIG_VM_EVENT_COUNTERS */
75 73
@@ -495,6 +493,10 @@ void refresh_cpu_vm_stats(int cpu)
495 atomic_long_add(global_diff[i], &vm_stat[i]); 493 atomic_long_add(global_diff[i], &vm_stat[i]);
496} 494}
497 495
496/*
497 * This is only called if !populated_zone(zone), which implies that no other
498 * users of pset->vm_stat_diff[] exist.
499 */
498void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) 500void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
499{ 501{
500 int i; 502 int i;