Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c     |   3
-rw-r--r--  mm/hugetlb.c         | 104
-rw-r--r--  mm/hwpoison-inject.c |  15
-rw-r--r--  mm/memcontrol.c      | 407
-rw-r--r--  mm/memory-failure.c  | 120
-rw-r--r--  mm/memory.c          |  25
-rw-r--r--  mm/oom_kill.c        |   2
-rw-r--r--  mm/page-writeback.c  | 185
-rw-r--r--  mm/rmap.c            |  59
-rw-r--r--  mm/vmalloc.c         |   4
-rw-r--r--  mm/vmscan.c          |  15
11 files changed, 630 insertions(+), 309 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 08d357522e78..eaa4a5bbe063 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -81,7 +81,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
81 | nr_more_io++; | 81 | nr_more_io++; |
82 | spin_unlock(&inode_lock); | 82 | spin_unlock(&inode_lock); |
83 | 83 | ||
84 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); | 84 | global_dirty_limits(&background_thresh, &dirty_thresh); |
85 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | ||
85 | 86 | ||
86 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 87 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
87 | seq_printf(m, | 88 | seq_printf(m, |
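Note on the hunk above: the old get_dirty_limits() helper is split into a global part, global_dirty_limits(), and a per-backing-device part, bdi_dirty_limit(). A minimal caller-side sketch follows; it is illustrative only, assumes just the two signatures used in this hunk, and the wrapper function name is made up.

/* Sketch, not part of the commit: adapting a get_dirty_limits() caller.
 * Assumed signatures, as used in the hunk above:
 *   void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 *   unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty);
 */
static void example_dirty_thresholds(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh, bdi_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);	/* global limits */
	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);		/* this device's share */
	/* ... the three thresholds are then used as the old helper's callers did ... */
}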
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b61d2db9f34e..cc5be788a39f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@ | |||
18 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
19 | #include <linux/sysfs.h> | 19 | #include <linux/sysfs.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/rmap.h> | ||
22 | #include <linux/swap.h> | ||
23 | #include <linux/swapops.h> | ||
21 | 24 | ||
22 | #include <asm/page.h> | 25 | #include <asm/page.h> |
23 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, | |||
220 | (vma->vm_pgoff >> huge_page_order(h)); | 223 | (vma->vm_pgoff >> huge_page_order(h)); |
221 | } | 224 | } |
222 | 225 | ||
226 | pgoff_t linear_hugepage_index(struct vm_area_struct *vma, | ||
227 | unsigned long address) | ||
228 | { | ||
229 | return vma_hugecache_offset(hstate_vma(vma), vma, address); | ||
230 | } | ||
231 | |||
223 | /* | 232 | /* |
224 | * Return the size of the pages allocated when backing a VMA. In the majority | 233 | * Return the size of the pages allocated when backing a VMA. In the majority |
225 | * cases this will be same size as used by the page table entries. | 234 | * cases this will be same size as used by the page table entries. |
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page) | |||
552 | set_page_private(page, 0); | 561 | set_page_private(page, 0); |
553 | page->mapping = NULL; | 562 | page->mapping = NULL; |
554 | BUG_ON(page_count(page)); | 563 | BUG_ON(page_count(page)); |
564 | BUG_ON(page_mapcount(page)); | ||
555 | INIT_LIST_HEAD(&page->lru); | 565 | INIT_LIST_HEAD(&page->lru); |
556 | 566 | ||
557 | spin_lock(&hugetlb_lock); | 567 | spin_lock(&hugetlb_lock); |
@@ -605,6 +615,8 @@ int PageHuge(struct page *page) | |||
605 | return dtor == free_huge_page; | 615 | return dtor == free_huge_page; |
606 | } | 616 | } |
607 | 617 | ||
618 | EXPORT_SYMBOL_GPL(PageHuge); | ||
619 | |||
608 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 620 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
609 | { | 621 | { |
610 | struct page *page; | 622 | struct page *page; |
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2129 | entry = huge_ptep_get(src_pte); | 2141 | entry = huge_ptep_get(src_pte); |
2130 | ptepage = pte_page(entry); | 2142 | ptepage = pte_page(entry); |
2131 | get_page(ptepage); | 2143 | get_page(ptepage); |
2144 | page_dup_rmap(ptepage); | ||
2132 | set_huge_pte_at(dst, addr, dst_pte, entry); | 2145 | set_huge_pte_at(dst, addr, dst_pte, entry); |
2133 | } | 2146 | } |
2134 | spin_unlock(&src->page_table_lock); | 2147 | spin_unlock(&src->page_table_lock); |
@@ -2140,6 +2153,19 @@ nomem: | |||
2140 | return -ENOMEM; | 2153 | return -ENOMEM; |
2141 | } | 2154 | } |
2142 | 2155 | ||
2156 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) | ||
2157 | { | ||
2158 | swp_entry_t swp; | ||
2159 | |||
2160 | if (huge_pte_none(pte) || pte_present(pte)) | ||
2161 | return 0; | ||
2162 | swp = pte_to_swp_entry(pte); | ||
2163 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { | ||
2164 | return 1; | ||
2165 | } else | ||
2166 | return 0; | ||
2167 | } | ||
2168 | |||
2143 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2169 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
2144 | unsigned long end, struct page *ref_page) | 2170 | unsigned long end, struct page *ref_page) |
2145 | { | 2171 | { |
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2198 | if (huge_pte_none(pte)) | 2224 | if (huge_pte_none(pte)) |
2199 | continue; | 2225 | continue; |
2200 | 2226 | ||
2227 | /* | ||
2228 | * HWPoisoned hugepage is already unmapped and dropped reference | ||
2229 | */ | ||
2230 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) | ||
2231 | continue; | ||
2232 | |||
2201 | page = pte_page(pte); | 2233 | page = pte_page(pte); |
2202 | if (pte_dirty(pte)) | 2234 | if (pte_dirty(pte)) |
2203 | set_page_dirty(page); | 2235 | set_page_dirty(page); |
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2207 | flush_tlb_range(vma, start, end); | 2239 | flush_tlb_range(vma, start, end); |
2208 | mmu_notifier_invalidate_range_end(mm, start, end); | 2240 | mmu_notifier_invalidate_range_end(mm, start, end); |
2209 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 2241 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
2242 | page_remove_rmap(page); | ||
2210 | list_del(&page->lru); | 2243 | list_del(&page->lru); |
2211 | put_page(page); | 2244 | put_page(page); |
2212 | } | 2245 | } |
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2272 | return 1; | 2305 | return 1; |
2273 | } | 2306 | } |
2274 | 2307 | ||
2308 | /* | ||
2309 | * Hugetlb_cow() should be called with page lock of the original hugepage held. | ||
2310 | */ | ||
2275 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | 2311 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
2276 | unsigned long address, pte_t *ptep, pte_t pte, | 2312 | unsigned long address, pte_t *ptep, pte_t pte, |
2277 | struct page *pagecache_page) | 2313 | struct page *pagecache_page) |
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2286 | retry_avoidcopy: | 2322 | retry_avoidcopy: |
2287 | /* If no-one else is actually using this page, avoid the copy | 2323 | /* If no-one else is actually using this page, avoid the copy |
2288 | * and just make the page writable */ | 2324 | * and just make the page writable */ |
2289 | avoidcopy = (page_count(old_page) == 1); | 2325 | avoidcopy = (page_mapcount(old_page) == 1); |
2290 | if (avoidcopy) { | 2326 | if (avoidcopy) { |
2327 | if (!trylock_page(old_page)) { | ||
2328 | if (PageAnon(old_page)) | ||
2329 | page_move_anon_rmap(old_page, vma, address); | ||
2330 | } else | ||
2331 | unlock_page(old_page); | ||
2291 | set_huge_ptep_writable(vma, address, ptep); | 2332 | set_huge_ptep_writable(vma, address, ptep); |
2292 | return 0; | 2333 | return 0; |
2293 | } | 2334 | } |
@@ -2338,6 +2379,13 @@ retry_avoidcopy: | |||
2338 | return -PTR_ERR(new_page); | 2379 | return -PTR_ERR(new_page); |
2339 | } | 2380 | } |
2340 | 2381 | ||
2382 | /* | ||
2383 | * When the original hugepage is shared one, it does not have | ||
2384 | * anon_vma prepared. | ||
2385 | */ | ||
2386 | if (unlikely(anon_vma_prepare(vma))) | ||
2387 | return VM_FAULT_OOM; | ||
2388 | |||
2341 | copy_huge_page(new_page, old_page, address, vma); | 2389 | copy_huge_page(new_page, old_page, address, vma); |
2342 | __SetPageUptodate(new_page); | 2390 | __SetPageUptodate(new_page); |
2343 | 2391 | ||
@@ -2355,6 +2403,8 @@ retry_avoidcopy: | |||
2355 | huge_ptep_clear_flush(vma, address, ptep); | 2403 | huge_ptep_clear_flush(vma, address, ptep); |
2356 | set_huge_pte_at(mm, address, ptep, | 2404 | set_huge_pte_at(mm, address, ptep, |
2357 | make_huge_pte(vma, new_page, 1)); | 2405 | make_huge_pte(vma, new_page, 1)); |
2406 | page_remove_rmap(old_page); | ||
2407 | hugepage_add_anon_rmap(new_page, vma, address); | ||
2358 | /* Make the old page be freed below */ | 2408 | /* Make the old page be freed below */ |
2359 | new_page = old_page; | 2409 | new_page = old_page; |
2360 | mmu_notifier_invalidate_range_end(mm, | 2410 | mmu_notifier_invalidate_range_end(mm, |
@@ -2458,10 +2508,29 @@ retry: | |||
2458 | spin_lock(&inode->i_lock); | 2508 | spin_lock(&inode->i_lock); |
2459 | inode->i_blocks += blocks_per_huge_page(h); | 2509 | inode->i_blocks += blocks_per_huge_page(h); |
2460 | spin_unlock(&inode->i_lock); | 2510 | spin_unlock(&inode->i_lock); |
2511 | page_dup_rmap(page); | ||
2461 | } else { | 2512 | } else { |
2462 | lock_page(page); | 2513 | lock_page(page); |
2463 | page->mapping = HUGETLB_POISON; | 2514 | if (unlikely(anon_vma_prepare(vma))) { |
2515 | ret = VM_FAULT_OOM; | ||
2516 | goto backout_unlocked; | ||
2517 | } | ||
2518 | hugepage_add_new_anon_rmap(page, vma, address); | ||
2464 | } | 2519 | } |
2520 | } else { | ||
2521 | page_dup_rmap(page); | ||
2522 | } | ||
2523 | |||
2524 | /* | ||
2525 | * Since memory error handler replaces pte into hwpoison swap entry | ||
2526 | * at the time of error handling, a process which reserved but does not have | ||
2527 | * the mapping to the error hugepage does not have hwpoison swap entry. | ||
2528 | * So we need to block accesses from such a process by checking | ||
2529 | * PG_hwpoison bit here. | ||
2530 | */ | ||
2531 | if (unlikely(PageHWPoison(page))) { | ||
2532 | ret = VM_FAULT_HWPOISON; | ||
2533 | goto backout_unlocked; | ||
2465 | } | 2534 | } |
2466 | 2535 | ||
2467 | /* | 2536 | /* |
@@ -2513,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2513 | pte_t *ptep; | 2582 | pte_t *ptep; |
2514 | pte_t entry; | 2583 | pte_t entry; |
2515 | int ret; | 2584 | int ret; |
2585 | struct page *page = NULL; | ||
2516 | struct page *pagecache_page = NULL; | 2586 | struct page *pagecache_page = NULL; |
2517 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 2587 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
2518 | struct hstate *h = hstate_vma(vma); | 2588 | struct hstate *h = hstate_vma(vma); |
2519 | 2589 | ||
2590 | ptep = huge_pte_offset(mm, address); | ||
2591 | if (ptep) { | ||
2592 | entry = huge_ptep_get(ptep); | ||
2593 | if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | ||
2594 | return VM_FAULT_HWPOISON; | ||
2595 | } | ||
2596 | |||
2520 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2597 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
2521 | if (!ptep) | 2598 | if (!ptep) |
2522 | return VM_FAULT_OOM; | 2599 | return VM_FAULT_OOM; |
@@ -2554,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2554 | vma, address); | 2631 | vma, address); |
2555 | } | 2632 | } |
2556 | 2633 | ||
2634 | if (!pagecache_page) { | ||
2635 | page = pte_page(entry); | ||
2636 | lock_page(page); | ||
2637 | } | ||
2638 | |||
2557 | spin_lock(&mm->page_table_lock); | 2639 | spin_lock(&mm->page_table_lock); |
2558 | /* Check for a racing update before calling hugetlb_cow */ | 2640 | /* Check for a racing update before calling hugetlb_cow */ |
2559 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | 2641 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) |
@@ -2579,6 +2661,8 @@ out_page_table_lock: | |||
2579 | if (pagecache_page) { | 2661 | if (pagecache_page) { |
2580 | unlock_page(pagecache_page); | 2662 | unlock_page(pagecache_page); |
2581 | put_page(pagecache_page); | 2663 | put_page(pagecache_page); |
2664 | } else { | ||
2665 | unlock_page(page); | ||
2582 | } | 2666 | } |
2583 | 2667 | ||
2584 | out_mutex: | 2668 | out_mutex: |
@@ -2791,3 +2875,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
2791 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 2875 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); |
2792 | hugetlb_acct_memory(h, -(chg - freed)); | 2876 | hugetlb_acct_memory(h, -(chg - freed)); |
2793 | } | 2877 | } |
2878 | |||
2879 | /* | ||
2880 | * This function is called from memory failure code. | ||
2881 | * Assume the caller holds page lock of the head page. | ||
2882 | */ | ||
2883 | void __isolate_hwpoisoned_huge_page(struct page *hpage) | ||
2884 | { | ||
2885 | struct hstate *h = page_hstate(hpage); | ||
2886 | int nid = page_to_nid(hpage); | ||
2887 | |||
2888 | spin_lock(&hugetlb_lock); | ||
2889 | list_del(&hpage->lru); | ||
2890 | h->free_huge_pages--; | ||
2891 | h->free_huge_pages_node[nid]--; | ||
2892 | spin_unlock(&hugetlb_lock); | ||
2893 | } | ||
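The last hugetlb.c hunk adds __isolate_hwpoisoned_huge_page(), which me_huge_page() in mm/memory-failure.c (further down this page) uses to recover from errors on hugepages that are not in use. A condensed caller-side sketch, mirroring that later hunk; illustrative only, and the function name here is made up.

/* Sketch, not part of the commit: dequeuing a poisoned hugepage that is not
 * in use.  The caller holds the page lock of the head page. */
static int example_recover_unused_hugepage(struct page *hpage)
{
	/* page_mapping()/PageAnon() distinguish in-use from free/reserved hugepages;
	 * hpage->lru cannot be used because hugetlb code reuses it internally. */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		__isolate_hwpoisoned_huge_page(hpage);	/* drop it from the hstate free list */
		return 1;				/* recovered */
	}
	return 0;					/* still mapped: needs the unmap path first */
}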
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1f..0948f1072d6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/swap.h> | 6 | #include <linux/swap.h> |
7 | #include <linux/pagemap.h> | 7 | #include <linux/pagemap.h> |
8 | #include <linux/hugetlb.h> | ||
8 | #include "internal.h" | 9 | #include "internal.h" |
9 | 10 | ||
10 | static struct dentry *hwpoison_dir; | 11 | static struct dentry *hwpoison_dir; |
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
13 | { | 14 | { |
14 | unsigned long pfn = val; | 15 | unsigned long pfn = val; |
15 | struct page *p; | 16 | struct page *p; |
17 | struct page *hpage; | ||
16 | int err; | 18 | int err; |
17 | 19 | ||
18 | if (!capable(CAP_SYS_ADMIN)) | 20 | if (!capable(CAP_SYS_ADMIN)) |
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val) | |||
24 | return -ENXIO; | 26 | return -ENXIO; |
25 | 27 | ||
26 | p = pfn_to_page(pfn); | 28 | p = pfn_to_page(pfn); |
29 | hpage = compound_head(p); | ||
27 | /* | 30 | /* |
28 | * This implies unable to support free buddy pages. | 31 | * This implies unable to support free buddy pages. |
29 | */ | 32 | */ |
30 | if (!get_page_unless_zero(p)) | 33 | if (!get_page_unless_zero(hpage)) |
31 | return 0; | 34 | return 0; |
32 | 35 | ||
33 | if (!PageLRU(p)) | 36 | if (!PageLRU(p) && !PageHuge(p)) |
34 | shake_page(p, 0); | 37 | shake_page(p, 0); |
35 | /* | 38 | /* |
36 | * This implies unable to support non-LRU pages. | 39 | * This implies unable to support non-LRU pages. |
37 | */ | 40 | */ |
38 | if (!PageLRU(p)) | 41 | if (!PageLRU(p) && !PageHuge(p)) |
39 | return 0; | 42 | return 0; |
40 | 43 | ||
41 | /* | 44 | /* |
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val) | |||
44 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | 47 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). |
45 | * __memory_failure() will redo the check reliably inside page lock. | 48 | * __memory_failure() will redo the check reliably inside page lock. |
46 | */ | 49 | */ |
47 | lock_page(p); | 50 | lock_page(hpage); |
48 | err = hwpoison_filter(p); | 51 | err = hwpoison_filter(hpage); |
49 | unlock_page(p); | 52 | unlock_page(hpage); |
50 | if (err) | 53 | if (err) |
51 | return 0; | 54 | return 0; |
52 | 55 | ||
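Both the injector above and __memory_failure() further down now pin and test the compound head rather than the raw page, so a hugetlb page is handled as one unit. Restated as a standalone sketch; illustrative only, and the function name is made up.

/* Sketch, not part of the commit: the pfn-to-head-page pattern shared by the
 * injector and the memory-failure handler. */
static int example_pin_for_poison_handling(unsigned long pfn)
{
	struct page *p = pfn_to_page(pfn);
	struct page *hpage = compound_head(p);	/* hugetlb pages are pinned as a whole */

	if (!get_page_unless_zero(hpage))
		return 0;			/* free page: no reference can be taken */
	if (!PageLRU(p) && !PageHuge(p))
		shake_page(p, 0);		/* try to move a non-LRU page onto the LRU */
	return PageLRU(p) || PageHuge(p);	/* only LRU or hugetlb pages are supported */
}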
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e64586..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/mm_inline.h> | 47 | #include <linux/mm_inline.h> |
48 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/oom.h> | ||
50 | #include "internal.h" | 51 | #include "internal.h" |
51 | 52 | ||
52 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
@@ -268,6 +269,7 @@ enum move_type { | |||
268 | 269 | ||
269 | /* "mc" and its members are protected by cgroup_mutex */ | 270 | /* "mc" and its members are protected by cgroup_mutex */ |
270 | static struct move_charge_struct { | 271 | static struct move_charge_struct { |
272 | spinlock_t lock; /* for from, to, moving_task */ | ||
271 | struct mem_cgroup *from; | 273 | struct mem_cgroup *from; |
272 | struct mem_cgroup *to; | 274 | struct mem_cgroup *to; |
273 | unsigned long precharge; | 275 | unsigned long precharge; |
@@ -276,6 +278,7 @@ static struct move_charge_struct { | |||
276 | struct task_struct *moving_task; /* a task moving charges */ | 278 | struct task_struct *moving_task; /* a task moving charges */ |
277 | wait_queue_head_t waitq; /* a waitq for other context */ | 279 | wait_queue_head_t waitq; /* a waitq for other context */ |
278 | } mc = { | 280 | } mc = { |
281 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | ||
279 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 282 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
280 | }; | 283 | }; |
281 | 284 | ||
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
836 | { | 839 | { |
837 | int ret; | 840 | int ret; |
838 | struct mem_cgroup *curr = NULL; | 841 | struct mem_cgroup *curr = NULL; |
842 | struct task_struct *p; | ||
839 | 843 | ||
840 | task_lock(task); | 844 | p = find_lock_task_mm(task); |
841 | rcu_read_lock(); | 845 | if (!p) |
842 | curr = try_get_mem_cgroup_from_mm(task->mm); | 846 | return 0; |
843 | rcu_read_unlock(); | 847 | curr = try_get_mem_cgroup_from_mm(p->mm); |
844 | task_unlock(task); | 848 | task_unlock(p); |
845 | if (!curr) | 849 | if (!curr) |
846 | return 0; | 850 | return 0; |
847 | /* | 851 | /* |
@@ -915,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | |||
915 | struct zone *zone, | 919 | struct zone *zone, |
916 | enum lru_list lru) | 920 | enum lru_list lru) |
917 | { | 921 | { |
918 | int nid = zone->zone_pgdat->node_id; | 922 | int nid = zone_to_nid(zone); |
919 | int zid = zone_idx(zone); | 923 | int zid = zone_idx(zone); |
920 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 924 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
921 | 925 | ||
@@ -925,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | |||
925 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | 929 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, |
926 | struct zone *zone) | 930 | struct zone *zone) |
927 | { | 931 | { |
928 | int nid = zone->zone_pgdat->node_id; | 932 | int nid = zone_to_nid(zone); |
929 | int zid = zone_idx(zone); | 933 | int zid = zone_idx(zone); |
930 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 934 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
931 | 935 | ||
@@ -970,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
970 | LIST_HEAD(pc_list); | 974 | LIST_HEAD(pc_list); |
971 | struct list_head *src; | 975 | struct list_head *src; |
972 | struct page_cgroup *pc, *tmp; | 976 | struct page_cgroup *pc, *tmp; |
973 | int nid = z->zone_pgdat->node_id; | 977 | int nid = zone_to_nid(z); |
974 | int zid = zone_idx(z); | 978 | int zid = zone_idx(z); |
975 | struct mem_cgroup_per_zone *mz; | 979 | struct mem_cgroup_per_zone *mz; |
976 | int lru = LRU_FILE * file + active; | 980 | int lru = LRU_FILE * file + active; |
@@ -1047,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) | |||
1047 | return swappiness; | 1051 | return swappiness; |
1048 | } | 1052 | } |
1049 | 1053 | ||
1054 | /* A routine for testing mem is not under move_account */ | ||
1055 | |||
1056 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) | ||
1057 | { | ||
1058 | struct mem_cgroup *from; | ||
1059 | struct mem_cgroup *to; | ||
1060 | bool ret = false; | ||
1061 | /* | ||
1062 | * Unlike task_move routines, we access mc.to, mc.from not under | ||
1063 | * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. | ||
1064 | */ | ||
1065 | spin_lock(&mc.lock); | ||
1066 | from = mc.from; | ||
1067 | to = mc.to; | ||
1068 | if (!from) | ||
1069 | goto unlock; | ||
1070 | if (from == mem || to == mem | ||
1071 | || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) | ||
1072 | || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) | ||
1073 | ret = true; | ||
1074 | unlock: | ||
1075 | spin_unlock(&mc.lock); | ||
1076 | return ret; | ||
1077 | } | ||
1078 | |||
1079 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) | ||
1080 | { | ||
1081 | if (mc.moving_task && current != mc.moving_task) { | ||
1082 | if (mem_cgroup_under_move(mem)) { | ||
1083 | DEFINE_WAIT(wait); | ||
1084 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); | ||
1085 | /* moving charge context might have finished. */ | ||
1086 | if (mc.moving_task) | ||
1087 | schedule(); | ||
1088 | finish_wait(&mc.waitq, &wait); | ||
1089 | return true; | ||
1090 | } | ||
1091 | } | ||
1092 | return false; | ||
1093 | } | ||
1094 | |||
1050 | static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | 1095 | static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) |
1051 | { | 1096 | { |
1052 | int *val = data; | 1097 | int *val = data; |
@@ -1255,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1255 | /* we use swappiness of local cgroup */ | 1300 | /* we use swappiness of local cgroup */ |
1256 | if (check_soft) | 1301 | if (check_soft) |
1257 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1302 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1258 | noswap, get_swappiness(victim), zone, | 1303 | noswap, get_swappiness(victim), zone); |
1259 | zone->zone_pgdat->node_id); | ||
1260 | else | 1304 | else |
1261 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1305 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1262 | noswap, get_swappiness(victim)); | 1306 | noswap, get_swappiness(victim)); |
@@ -1363,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) | |||
1363 | 1407 | ||
1364 | static void memcg_oom_recover(struct mem_cgroup *mem) | 1408 | static void memcg_oom_recover(struct mem_cgroup *mem) |
1365 | { | 1409 | { |
1366 | if (atomic_read(&mem->oom_lock)) | 1410 | if (mem && atomic_read(&mem->oom_lock)) |
1367 | memcg_wakeup_oom(mem); | 1411 | memcg_wakeup_oom(mem); |
1368 | } | 1412 | } |
1369 | 1413 | ||
@@ -1575,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | |||
1575 | return NOTIFY_OK; | 1619 | return NOTIFY_OK; |
1576 | } | 1620 | } |
1577 | 1621 | ||
1622 | |||
1623 | /* See __mem_cgroup_try_charge() for details */ | ||
1624 | enum { | ||
1625 | CHARGE_OK, /* success */ | ||
1626 | CHARGE_RETRY, /* need to retry but retry is not bad */ | ||
1627 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ | ||
1628 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ | ||
1629 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | ||
1630 | }; | ||
1631 | |||
1632 | static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | ||
1633 | int csize, bool oom_check) | ||
1634 | { | ||
1635 | struct mem_cgroup *mem_over_limit; | ||
1636 | struct res_counter *fail_res; | ||
1637 | unsigned long flags = 0; | ||
1638 | int ret; | ||
1639 | |||
1640 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1641 | |||
1642 | if (likely(!ret)) { | ||
1643 | if (!do_swap_account) | ||
1644 | return CHARGE_OK; | ||
1645 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); | ||
1646 | if (likely(!ret)) | ||
1647 | return CHARGE_OK; | ||
1648 | |||
1649 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | ||
1650 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | ||
1651 | } else | ||
1652 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | ||
1653 | |||
1654 | if (csize > PAGE_SIZE) /* change csize and retry */ | ||
1655 | return CHARGE_RETRY; | ||
1656 | |||
1657 | if (!(gfp_mask & __GFP_WAIT)) | ||
1658 | return CHARGE_WOULDBLOCK; | ||
1659 | |||
1660 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | ||
1661 | gfp_mask, flags); | ||
1662 | /* | ||
1663 | * try_to_free_mem_cgroup_pages() might not give us a full | ||
1664 | * picture of reclaim. Some pages are reclaimed and might be | ||
1665 | * moved to swap cache or just unmapped from the cgroup. | ||
1666 | * Check the limit again to see if the reclaim reduced the | ||
1667 | * current usage of the cgroup before giving up | ||
1668 | */ | ||
1669 | if (ret || mem_cgroup_check_under_limit(mem_over_limit)) | ||
1670 | return CHARGE_RETRY; | ||
1671 | |||
1672 | /* | ||
1673 | * At task move, charge accounts can be doubly counted. So, it's | ||
1674 | * better to wait until the end of task_move if something is going on. | ||
1675 | */ | ||
1676 | if (mem_cgroup_wait_acct_move(mem_over_limit)) | ||
1677 | return CHARGE_RETRY; | ||
1678 | |||
1679 | /* If we don't need to call oom-killer at all, return immediately */ | ||
1680 | if (!oom_check) | ||
1681 | return CHARGE_NOMEM; | ||
1682 | /* check OOM */ | ||
1683 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) | ||
1684 | return CHARGE_OOM_DIE; | ||
1685 | |||
1686 | return CHARGE_RETRY; | ||
1687 | } | ||
1688 | |||
1578 | /* | 1689 | /* |
1579 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1690 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1580 | * oom-killer can be invoked. | 1691 | * oom-killer can be invoked. |
1581 | */ | 1692 | */ |
1582 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1693 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1583 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) | 1694 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
1584 | { | 1695 | { |
1585 | struct mem_cgroup *mem, *mem_over_limit; | 1696 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1586 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1697 | struct mem_cgroup *mem = NULL; |
1587 | struct res_counter *fail_res; | 1698 | int ret; |
1588 | int csize = CHARGE_SIZE; | 1699 | int csize = CHARGE_SIZE; |
1589 | 1700 | ||
1590 | /* | 1701 | /* |
@@ -1602,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1602 | * thread group leader migrates. It's possible that mm is not | 1713 | * thread group leader migrates. It's possible that mm is not |
1603 | * set, if so charge the init_mm (happens for pagecache usage). | 1714 | * set, if so charge the init_mm (happens for pagecache usage). |
1604 | */ | 1715 | */ |
1605 | mem = *memcg; | 1716 | if (!*memcg && !mm) |
1606 | if (likely(!mem)) { | 1717 | goto bypass; |
1607 | mem = try_get_mem_cgroup_from_mm(mm); | 1718 | again: |
1608 | *memcg = mem; | 1719 | if (*memcg) { /* css should be a valid one */ |
1609 | } else { | 1720 | mem = *memcg; |
1610 | css_get(&mem->css); | 1721 | VM_BUG_ON(css_is_removed(&mem->css)); |
1611 | } | 1722 | if (mem_cgroup_is_root(mem)) |
1612 | if (unlikely(!mem)) | 1723 | goto done; |
1613 | return 0; | ||
1614 | |||
1615 | VM_BUG_ON(css_is_removed(&mem->css)); | ||
1616 | if (mem_cgroup_is_root(mem)) | ||
1617 | goto done; | ||
1618 | |||
1619 | while (1) { | ||
1620 | int ret = 0; | ||
1621 | unsigned long flags = 0; | ||
1622 | |||
1623 | if (consume_stock(mem)) | 1724 | if (consume_stock(mem)) |
1624 | goto done; | 1725 | goto done; |
1726 | css_get(&mem->css); | ||
1727 | } else { | ||
1728 | struct task_struct *p; | ||
1625 | 1729 | ||
1626 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 1730 | rcu_read_lock(); |
1627 | if (likely(!ret)) { | 1731 | p = rcu_dereference(mm->owner); |
1628 | if (!do_swap_account) | 1732 | VM_BUG_ON(!p); |
1629 | break; | ||
1630 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); | ||
1631 | if (likely(!ret)) | ||
1632 | break; | ||
1633 | /* mem+swap counter fails */ | ||
1634 | res_counter_uncharge(&mem->res, csize); | ||
1635 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | ||
1636 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | ||
1637 | memsw); | ||
1638 | } else | ||
1639 | /* mem counter fails */ | ||
1640 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | ||
1641 | res); | ||
1642 | |||
1643 | /* reduce request size and retry */ | ||
1644 | if (csize > PAGE_SIZE) { | ||
1645 | csize = PAGE_SIZE; | ||
1646 | continue; | ||
1647 | } | ||
1648 | if (!(gfp_mask & __GFP_WAIT)) | ||
1649 | goto nomem; | ||
1650 | |||
1651 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | ||
1652 | gfp_mask, flags); | ||
1653 | if (ret) | ||
1654 | continue; | ||
1655 | |||
1656 | /* | 1733 | /* |
1657 | * try_to_free_mem_cgroup_pages() might not give us a full | 1734 | * because we don't have task_lock(), "p" can exit while |
1658 | * picture of reclaim. Some pages are reclaimed and might be | 1735 | * we're here. In that case, "mem" can point to root |
1659 | * moved to swap cache or just unmapped from the cgroup. | 1736 | * cgroup but never be NULL. (and task_struct itself is freed |
1660 | * Check the limit again to see if the reclaim reduced the | 1737 | * by RCU, cgroup itself is RCU safe.) Then, we have small |
1661 | * current usage of the cgroup before giving up | 1738 | * risk here to get wrong cgroup. But such kind of mis-account |
1662 | * | 1739 | * by race always happens because we don't have cgroup_mutex(). |
1740 | * It's overkill and we allow that small race, here. | ||
1663 | */ | 1741 | */ |
1664 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1742 | mem = mem_cgroup_from_task(p); |
1665 | continue; | 1743 | VM_BUG_ON(!mem); |
1666 | 1744 | if (mem_cgroup_is_root(mem)) { | |
1667 | /* try to avoid oom while someone is moving charge */ | 1745 | rcu_read_unlock(); |
1668 | if (mc.moving_task && current != mc.moving_task) { | 1746 | goto done; |
1669 | struct mem_cgroup *from, *to; | 1747 | } |
1670 | bool do_continue = false; | 1748 | if (consume_stock(mem)) { |
1671 | /* | 1749 | /* |
1672 | * There is a small race that "from" or "to" can be | 1750 | * It seems dangerous to access memcg without css_get().
1673 | * freed by rmdir, so we use css_tryget(). | 1751 | * But considering how consume_stock works, it's not
1752 | * necessary. If consume_stock succeeds, some charges | ||
1753 | * from this memcg are cached on this cpu. So, we | ||
1754 | * don't need to call css_get()/css_tryget() before | ||
1755 | * calling consume_stock(). | ||
1674 | */ | 1756 | */ |
1675 | from = mc.from; | 1757 | rcu_read_unlock(); |
1676 | to = mc.to; | 1758 | goto done; |
1677 | if (from && css_tryget(&from->css)) { | 1759 | } |
1678 | if (mem_over_limit->use_hierarchy) | 1760 | /* after here, we may be blocked. we need to get refcnt */ |
1679 | do_continue = css_is_ancestor( | 1761 | if (!css_tryget(&mem->css)) { |
1680 | &from->css, | 1762 | rcu_read_unlock(); |
1681 | &mem_over_limit->css); | 1763 | goto again; |
1682 | else | 1764 | } |
1683 | do_continue = (from == mem_over_limit); | 1765 | rcu_read_unlock(); |
1684 | css_put(&from->css); | 1766 | } |
1685 | } | 1767 | |
1686 | if (!do_continue && to && css_tryget(&to->css)) { | 1768 | do { |
1687 | if (mem_over_limit->use_hierarchy) | 1769 | bool oom_check; |
1688 | do_continue = css_is_ancestor( | 1770 | |
1689 | &to->css, | 1771 | /* If killed, bypass charge */ |
1690 | &mem_over_limit->css); | 1772 | if (fatal_signal_pending(current)) { |
1691 | else | 1773 | css_put(&mem->css); |
1692 | do_continue = (to == mem_over_limit); | 1774 | goto bypass; |
1693 | css_put(&to->css); | ||
1694 | } | ||
1695 | if (do_continue) { | ||
1696 | DEFINE_WAIT(wait); | ||
1697 | prepare_to_wait(&mc.waitq, &wait, | ||
1698 | TASK_INTERRUPTIBLE); | ||
1699 | /* moving charge context might have finished. */ | ||
1700 | if (mc.moving_task) | ||
1701 | schedule(); | ||
1702 | finish_wait(&mc.waitq, &wait); | ||
1703 | continue; | ||
1704 | } | ||
1705 | } | 1775 | } |
1706 | 1776 | ||
1707 | if (!nr_retries--) { | 1777 | oom_check = false; |
1708 | if (!oom) | 1778 | if (oom && !nr_oom_retries) { |
1779 | oom_check = true; | ||
1780 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1781 | } | ||
1782 | |||
1783 | ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); | ||
1784 | |||
1785 | switch (ret) { | ||
1786 | case CHARGE_OK: | ||
1787 | break; | ||
1788 | case CHARGE_RETRY: /* not in OOM situation but retry */ | ||
1789 | csize = PAGE_SIZE; | ||
1790 | css_put(&mem->css); | ||
1791 | mem = NULL; | ||
1792 | goto again; | ||
1793 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ | ||
1794 | css_put(&mem->css); | ||
1795 | goto nomem; | ||
1796 | case CHARGE_NOMEM: /* OOM routine works */ | ||
1797 | if (!oom) { | ||
1798 | css_put(&mem->css); | ||
1709 | goto nomem; | 1799 | goto nomem; |
1710 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { | ||
1711 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1712 | continue; | ||
1713 | } | 1800 | } |
1714 | /* When we reach here, current task is dying .*/ | 1801 | /* If oom, we never return -ENOMEM */ |
1802 | nr_oom_retries--; | ||
1803 | break; | ||
1804 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ | ||
1715 | css_put(&mem->css); | 1805 | css_put(&mem->css); |
1716 | goto bypass; | 1806 | goto bypass; |
1717 | } | 1807 | } |
1718 | } | 1808 | } while (ret != CHARGE_OK); |
1809 | |||
1719 | if (csize > PAGE_SIZE) | 1810 | if (csize > PAGE_SIZE) |
1720 | refill_stock(mem, csize - PAGE_SIZE); | 1811 | refill_stock(mem, csize - PAGE_SIZE); |
1812 | css_put(&mem->css); | ||
1721 | done: | 1813 | done: |
1814 | *memcg = mem; | ||
1722 | return 0; | 1815 | return 0; |
1723 | nomem: | 1816 | nomem: |
1724 | css_put(&mem->css); | 1817 | *memcg = NULL; |
1725 | return -ENOMEM; | 1818 | return -ENOMEM; |
1726 | bypass: | 1819 | bypass: |
1727 | *memcg = NULL; | 1820 | *memcg = NULL; |
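To make the rewritten __mem_cgroup_try_charge() easier to follow than the interleaved hunk above, here is a de-interleaved sketch of the retry loop around __mem_cgroup_do_charge(). It is an illustrative restatement of the loop body only; the css reference handling and the "goto again" re-resolution of the memcg are omitted.

/* Sketch, not part of the commit: the charge retry protocol, condensed. */
do {
	bool oom_check = false;

	if (fatal_signal_pending(current))
		goto bypass;			/* killed task: let the charge bypass */

	if (oom && !nr_oom_retries) {
		oom_check = true;		/* this attempt may invoke the OOM killer */
		nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
	}

	ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
	switch (ret) {
	case CHARGE_OK:
		break;
	case CHARGE_RETRY:			/* batch too big, or reclaim made progress */
		csize = PAGE_SIZE;		/* retry with a single page */
		break;
	case CHARGE_WOULDBLOCK:			/* !__GFP_WAIT */
		goto nomem;
	case CHARGE_NOMEM:
		if (!oom)
			goto nomem;		/* caller forbids the OOM killer */
		nr_oom_retries--;		/* with oom==true we never return -ENOMEM */
		break;
	case CHARGE_OOM_DIE:			/* current was killed by the OOM killer */
		goto bypass;
	}
} while (ret != CHARGE_OK);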
@@ -1740,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | |||
1740 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | 1833 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); |
1741 | if (do_swap_account) | 1834 | if (do_swap_account) |
1742 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); | 1835 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); |
1743 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
1744 | WARN_ON_ONCE(count > INT_MAX); | ||
1745 | __css_put(&mem->css, (int)count); | ||
1746 | } | 1836 | } |
1747 | /* we don't need css_put for root */ | ||
1748 | } | 1837 | } |
1749 | 1838 | ||
1750 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 1839 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) |
@@ -1972,10 +2061,9 @@ out: | |||
1972 | * < 0 if the cgroup is over its limit | 2061 | * < 0 if the cgroup is over its limit |
1973 | */ | 2062 | */ |
1974 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 2063 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
1975 | gfp_t gfp_mask, enum charge_type ctype, | 2064 | gfp_t gfp_mask, enum charge_type ctype) |
1976 | struct mem_cgroup *memcg) | ||
1977 | { | 2065 | { |
1978 | struct mem_cgroup *mem; | 2066 | struct mem_cgroup *mem = NULL; |
1979 | struct page_cgroup *pc; | 2067 | struct page_cgroup *pc; |
1980 | int ret; | 2068 | int ret; |
1981 | 2069 | ||
@@ -1985,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1985 | return 0; | 2073 | return 0; |
1986 | prefetchw(pc); | 2074 | prefetchw(pc); |
1987 | 2075 | ||
1988 | mem = memcg; | ||
1989 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 2076 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
1990 | if (ret || !mem) | 2077 | if (ret || !mem) |
1991 | return ret; | 2078 | return ret; |
@@ -2013,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2013 | if (unlikely(!mm)) | 2100 | if (unlikely(!mm)) |
2014 | mm = &init_mm; | 2101 | mm = &init_mm; |
2015 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2102 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2016 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); | 2103 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2017 | } | 2104 | } |
2018 | 2105 | ||
2019 | static void | 2106 | static void |
@@ -2023,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2023 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2110 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2024 | gfp_t gfp_mask) | 2111 | gfp_t gfp_mask) |
2025 | { | 2112 | { |
2026 | struct mem_cgroup *mem = NULL; | ||
2027 | int ret; | 2113 | int ret; |
2028 | 2114 | ||
2029 | if (mem_cgroup_disabled()) | 2115 | if (mem_cgroup_disabled()) |
@@ -2044,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2044 | if (!(gfp_mask & __GFP_WAIT)) { | 2130 | if (!(gfp_mask & __GFP_WAIT)) { |
2045 | struct page_cgroup *pc; | 2131 | struct page_cgroup *pc; |
2046 | 2132 | ||
2047 | |||
2048 | pc = lookup_page_cgroup(page); | 2133 | pc = lookup_page_cgroup(page); |
2049 | if (!pc) | 2134 | if (!pc) |
2050 | return 0; | 2135 | return 0; |
@@ -2056,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2056 | unlock_page_cgroup(pc); | 2141 | unlock_page_cgroup(pc); |
2057 | } | 2142 | } |
2058 | 2143 | ||
2059 | if (unlikely(!mm && !mem)) | 2144 | if (unlikely(!mm)) |
2060 | mm = &init_mm; | 2145 | mm = &init_mm; |
2061 | 2146 | ||
2062 | if (page_is_file_cache(page)) | 2147 | if (page_is_file_cache(page)) |
2063 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2148 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2064 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 2149 | MEM_CGROUP_CHARGE_TYPE_CACHE); |
2065 | 2150 | ||
2066 | /* shmem */ | 2151 | /* shmem */ |
2067 | if (PageSwapCache(page)) { | 2152 | if (PageSwapCache(page)) { |
2153 | struct mem_cgroup *mem = NULL; | ||
2154 | |||
2068 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 2155 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
2069 | if (!ret) | 2156 | if (!ret) |
2070 | __mem_cgroup_commit_charge_swapin(page, mem, | 2157 | __mem_cgroup_commit_charge_swapin(page, mem, |
2071 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | 2158 | MEM_CGROUP_CHARGE_TYPE_SHMEM); |
2072 | } else | 2159 | } else |
2073 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | 2160 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, |
2074 | MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); | 2161 | MEM_CGROUP_CHARGE_TYPE_SHMEM); |
2075 | 2162 | ||
2076 | return ret; | 2163 | return ret; |
2077 | } | 2164 | } |
@@ -2107,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2107 | goto charge_cur_mm; | 2194 | goto charge_cur_mm; |
2108 | *ptr = mem; | 2195 | *ptr = mem; |
2109 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 2196 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
2110 | /* drop extra refcnt from tryget */ | ||
2111 | css_put(&mem->css); | 2197 | css_put(&mem->css); |
2112 | return ret; | 2198 | return ret; |
2113 | charge_cur_mm: | 2199 | charge_cur_mm: |
@@ -2238,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2238 | { | 2324 | { |
2239 | struct page_cgroup *pc; | 2325 | struct page_cgroup *pc; |
2240 | struct mem_cgroup *mem = NULL; | 2326 | struct mem_cgroup *mem = NULL; |
2241 | struct mem_cgroup_per_zone *mz; | ||
2242 | 2327 | ||
2243 | if (mem_cgroup_disabled()) | 2328 | if (mem_cgroup_disabled()) |
2244 | return NULL; | 2329 | return NULL; |
@@ -2278,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2278 | break; | 2363 | break; |
2279 | } | 2364 | } |
2280 | 2365 | ||
2281 | if (!mem_cgroup_is_root(mem)) | ||
2282 | __do_uncharge(mem, ctype); | ||
2283 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
2284 | mem_cgroup_swap_statistics(mem, true); | ||
2285 | mem_cgroup_charge_statistics(mem, pc, false); | 2366 | mem_cgroup_charge_statistics(mem, pc, false); |
2286 | 2367 | ||
2287 | ClearPageCgroupUsed(pc); | 2368 | ClearPageCgroupUsed(pc); |
@@ -2292,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2292 | * special functions. | 2373 | * special functions. |
2293 | */ | 2374 | */ |
2294 | 2375 | ||
2295 | mz = page_cgroup_zoneinfo(pc); | ||
2296 | unlock_page_cgroup(pc); | 2376 | unlock_page_cgroup(pc); |
2297 | 2377 | /* | |
2378 | * even after unlock, we have mem->res.usage here and this memcg | ||
2379 | * will never be freed. | ||
2380 | */ | ||
2298 | memcg_check_events(mem, page); | 2381 | memcg_check_events(mem, page); |
2299 | /* at swapout, this memcg will be accessed to record to swap */ | 2382 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { |
2300 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2383 | mem_cgroup_swap_statistics(mem, true); |
2301 | css_put(&mem->css); | 2384 | mem_cgroup_get(mem); |
2385 | } | ||
2386 | if (!mem_cgroup_is_root(mem)) | ||
2387 | __do_uncharge(mem, ctype); | ||
2302 | 2388 | ||
2303 | return mem; | 2389 | return mem; |
2304 | 2390 | ||
@@ -2385,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
2385 | 2471 | ||
2386 | memcg = __mem_cgroup_uncharge_common(page, ctype); | 2472 | memcg = __mem_cgroup_uncharge_common(page, ctype); |
2387 | 2473 | ||
2388 | /* record memcg information */ | 2474 | /* |
2389 | if (do_swap_account && swapout && memcg) { | 2475 | * record memcg information, if swapout && memcg != NULL, |
2476 | * mem_cgroup_get() was called in uncharge(). | ||
2477 | */ | ||
2478 | if (do_swap_account && swapout && memcg) | ||
2390 | swap_cgroup_record(ent, css_id(&memcg->css)); | 2479 | swap_cgroup_record(ent, css_id(&memcg->css)); |
2391 | mem_cgroup_get(memcg); | ||
2392 | } | ||
2393 | if (swapout && memcg) | ||
2394 | css_put(&memcg->css); | ||
2395 | } | 2480 | } |
2396 | #endif | 2481 | #endif |
2397 | 2482 | ||
@@ -2469,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2469 | */ | 2554 | */ |
2470 | if (!mem_cgroup_is_root(to)) | 2555 | if (!mem_cgroup_is_root(to)) |
2471 | res_counter_uncharge(&to->res, PAGE_SIZE); | 2556 | res_counter_uncharge(&to->res, PAGE_SIZE); |
2472 | css_put(&to->css); | ||
2473 | } | 2557 | } |
2474 | return 0; | 2558 | return 0; |
2475 | } | 2559 | } |
@@ -2604,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
2604 | ClearPageCgroupMigration(pc); | 2688 | ClearPageCgroupMigration(pc); |
2605 | unlock_page_cgroup(pc); | 2689 | unlock_page_cgroup(pc); |
2606 | 2690 | ||
2607 | if (unused != oldpage) | ||
2608 | pc = lookup_page_cgroup(unused); | ||
2609 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | 2691 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); |
2610 | 2692 | ||
2611 | pc = lookup_page_cgroup(used); | ||
2612 | /* | 2693 | /* |
2613 | * If a page is a file cache, radix-tree replacement is very atomic | 2694 | * If a page is a file cache, radix-tree replacement is very atomic |
2614 | * and we can skip this check. When it was an Anon page, its mapcount | 2695 | * and we can skip this check. When it was an Anon page, its mapcount |
@@ -2784,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2784 | } | 2865 | } |
2785 | 2866 | ||
2786 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 2867 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
2787 | gfp_t gfp_mask, int nid, | 2868 | gfp_t gfp_mask) |
2788 | int zid) | ||
2789 | { | 2869 | { |
2790 | unsigned long nr_reclaimed = 0; | 2870 | unsigned long nr_reclaimed = 0; |
2791 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | 2871 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; |
@@ -2797,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
2797 | if (order > 0) | 2877 | if (order > 0) |
2798 | return 0; | 2878 | return 0; |
2799 | 2879 | ||
2800 | mctz = soft_limit_tree_node_zone(nid, zid); | 2880 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); |
2801 | /* | 2881 | /* |
2802 | * This loop can run a while, specially if mem_cgroup's continuously | 2882 | * This loop can run a while, specially if mem_cgroup's continuously |
2803 | * keep exceeding their soft limit and putting the system under | 2883 | * keep exceeding their soft limit and putting the system under |
@@ -3752,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | |||
3752 | return 0; | 3832 | return 0; |
3753 | } | 3833 | } |
3754 | 3834 | ||
3755 | /* | ||
3756 | */ | ||
3757 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | 3835 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, |
3758 | struct cftype *cft, u64 val) | 3836 | struct cftype *cft, u64 val) |
3759 | { | 3837 | { |
@@ -4173,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count) | |||
4173 | goto one_by_one; | 4251 | goto one_by_one; |
4174 | } | 4252 | } |
4175 | mc.precharge += count; | 4253 | mc.precharge += count; |
4176 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
4177 | WARN_ON_ONCE(count > INT_MAX); | ||
4178 | __css_get(&mem->css, (int)count); | ||
4179 | return ret; | 4254 | return ret; |
4180 | } | 4255 | } |
4181 | one_by_one: | 4256 | one_by_one: |
@@ -4393,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm) | |||
4393 | 4468 | ||
4394 | static void mem_cgroup_clear_mc(void) | 4469 | static void mem_cgroup_clear_mc(void) |
4395 | { | 4470 | { |
4471 | struct mem_cgroup *from = mc.from; | ||
4472 | struct mem_cgroup *to = mc.to; | ||
4473 | |||
4396 | /* we must uncharge all the leftover precharges from mc.to */ | 4474 | /* we must uncharge all the leftover precharges from mc.to */ |
4397 | if (mc.precharge) { | 4475 | if (mc.precharge) { |
4398 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | 4476 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); |
4399 | mc.precharge = 0; | 4477 | mc.precharge = 0; |
4400 | memcg_oom_recover(mc.to); | ||
4401 | } | 4478 | } |
4402 | /* | 4479 | /* |
4403 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 4480 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
@@ -4406,11 +4483,9 @@ static void mem_cgroup_clear_mc(void) | |||
4406 | if (mc.moved_charge) { | 4483 | if (mc.moved_charge) { |
4407 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | 4484 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); |
4408 | mc.moved_charge = 0; | 4485 | mc.moved_charge = 0; |
4409 | memcg_oom_recover(mc.from); | ||
4410 | } | 4486 | } |
4411 | /* we must fixup refcnts and charges */ | 4487 | /* we must fixup refcnts and charges */ |
4412 | if (mc.moved_swap) { | 4488 | if (mc.moved_swap) { |
4413 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
4414 | /* uncharge swap account from the old cgroup */ | 4489 | /* uncharge swap account from the old cgroup */ |
4415 | if (!mem_cgroup_is_root(mc.from)) | 4490 | if (!mem_cgroup_is_root(mc.from)) |
4416 | res_counter_uncharge(&mc.from->memsw, | 4491 | res_counter_uncharge(&mc.from->memsw, |
@@ -4424,16 +4499,18 @@ static void mem_cgroup_clear_mc(void) | |||
4424 | */ | 4499 | */ |
4425 | res_counter_uncharge(&mc.to->res, | 4500 | res_counter_uncharge(&mc.to->res, |
4426 | PAGE_SIZE * mc.moved_swap); | 4501 | PAGE_SIZE * mc.moved_swap); |
4427 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
4428 | __css_put(&mc.to->css, mc.moved_swap); | ||
4429 | } | 4502 | } |
4430 | /* we've already done mem_cgroup_get(mc.to) */ | 4503 | /* we've already done mem_cgroup_get(mc.to) */ |
4431 | 4504 | ||
4432 | mc.moved_swap = 0; | 4505 | mc.moved_swap = 0; |
4433 | } | 4506 | } |
4507 | spin_lock(&mc.lock); | ||
4434 | mc.from = NULL; | 4508 | mc.from = NULL; |
4435 | mc.to = NULL; | 4509 | mc.to = NULL; |
4436 | mc.moving_task = NULL; | 4510 | mc.moving_task = NULL; |
4511 | spin_unlock(&mc.lock); | ||
4512 | memcg_oom_recover(from); | ||
4513 | memcg_oom_recover(to); | ||
4437 | wake_up_all(&mc.waitq); | 4514 | wake_up_all(&mc.waitq); |
4438 | } | 4515 | } |
4439 | 4516 | ||
@@ -4462,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
4462 | VM_BUG_ON(mc.moved_charge); | 4539 | VM_BUG_ON(mc.moved_charge); |
4463 | VM_BUG_ON(mc.moved_swap); | 4540 | VM_BUG_ON(mc.moved_swap); |
4464 | VM_BUG_ON(mc.moving_task); | 4541 | VM_BUG_ON(mc.moving_task); |
4542 | spin_lock(&mc.lock); | ||
4465 | mc.from = from; | 4543 | mc.from = from; |
4466 | mc.to = mem; | 4544 | mc.to = mem; |
4467 | mc.precharge = 0; | 4545 | mc.precharge = 0; |
4468 | mc.moved_charge = 0; | 4546 | mc.moved_charge = 0; |
4469 | mc.moved_swap = 0; | 4547 | mc.moved_swap = 0; |
4470 | mc.moving_task = current; | 4548 | mc.moving_task = current; |
4549 | spin_unlock(&mc.lock); | ||
4471 | 4550 | ||
4472 | ret = mem_cgroup_precharge_mc(mm); | 4551 | ret = mem_cgroup_precharge_mc(mm); |
4473 | if (ret) | 4552 | if (ret) |
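The mc.lock spinlock introduced above lets contexts that do not hold cgroup_mutex read the move-charge target safely: mem_cgroup_under_move() is the lockless-context reader, while the attach and clear paths are the writers. A simplified reader sketch follows; the hierarchy check is omitted, it is illustrative only, and the function name is made up.

/* Sketch, not part of the commit: reading the move-charge state under mc.lock. */
static bool example_is_move_target(struct mem_cgroup *mem)
{
	bool ret;

	spin_lock(&mc.lock);		/* protects mc.from, mc.to, mc.moving_task */
	ret = (mc.from == mem || mc.to == mem);
	spin_unlock(&mc.lock);
	return ret;
}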
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52cacaa..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/suspend.h> | 46 | #include <linux/suspend.h> |
47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
48 | #include <linux/swapops.h> | 48 | #include <linux/swapops.h> |
49 | #include <linux/hugetlb.h> | ||
49 | #include "internal.h" | 50 | #include "internal.h" |
50 | 51 | ||
51 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 52 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) | |||
690 | /* | 691 | /* |
691 | * Huge pages. Needs work. | 692 | * Huge pages. Needs work. |
692 | * Issues: | 693 | * Issues: |
693 | * No rmap support so we cannot find the original mapper. In theory could walk | 694 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
694 | * all MMs and look for the mappings, but that would be non atomic and racy. | 695 | * To narrow down kill region to one page, we need to break up pmd. |
695 | * Need rmap for hugepages for this. Alternatively we could employ a heuristic, | 696 | * - To support soft-offlining for hugepage, we need to support hugepage |
696 | * like just walking the current process and hoping it has it mapped (that | 697 | * migration. |
697 | * should be usually true for the common "shared database cache" case) | ||
698 | * Should handle free huge pages and dequeue them too, but this needs to | ||
699 | * handle huge page accounting correctly. | ||
700 | */ | 698 | */ |
701 | static int me_huge_page(struct page *p, unsigned long pfn) | 699 | static int me_huge_page(struct page *p, unsigned long pfn) |
702 | { | 700 | { |
703 | return FAILED; | 701 | struct page *hpage = compound_head(p); |
702 | /* | ||
703 | * We can safely recover from error on free or reserved (i.e. | ||
704 | * not in-use) hugepage by dequeuing it from freelist. | ||
705 | * To check whether a hugepage is in-use or not, we can't use | ||
706 | * page->lru because it can be used in other hugepage operations, | ||
707 | * such as __unmap_hugepage_range() and gather_surplus_pages(). | ||
708 | * So instead we use page_mapping() and PageAnon(). | ||
709 | * We assume that this function is called with page lock held, | ||
710 | * so there is no race between isolation and mapping/unmapping. | ||
711 | */ | ||
712 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | ||
713 | __isolate_hwpoisoned_huge_page(hpage); | ||
714 | return RECOVERED; | ||
715 | } | ||
716 | return DELAYED; | ||
704 | } | 717 | } |
705 | 718 | ||
706 | /* | 719 | /* |
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
838 | int ret; | 851 | int ret; |
839 | int i; | 852 | int i; |
840 | int kill = 1; | 853 | int kill = 1; |
854 | struct page *hpage = compound_head(p); | ||
841 | 855 | ||
842 | if (PageReserved(p) || PageSlab(p)) | 856 | if (PageReserved(p) || PageSlab(p)) |
843 | return SWAP_SUCCESS; | 857 | return SWAP_SUCCESS; |
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
846 | * This check implies we don't kill processes if their pages | 860 | * This check implies we don't kill processes if their pages |
847 | * are in the swap cache early. Those are always late kills. | 861 | * are in the swap cache early. Those are always late kills. |
848 | */ | 862 | */ |
849 | if (!page_mapped(p)) | 863 | if (!page_mapped(hpage)) |
850 | return SWAP_SUCCESS; | 864 | return SWAP_SUCCESS; |
851 | 865 | ||
852 | if (PageCompound(p) || PageKsm(p)) | 866 | if (PageKsm(p)) |
853 | return SWAP_FAIL; | 867 | return SWAP_FAIL; |
854 | 868 | ||
855 | if (PageSwapCache(p)) { | 869 | if (PageSwapCache(p)) { |
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
864 | * XXX: the dirty test could be racy: set_page_dirty() may not always | 878 | * XXX: the dirty test could be racy: set_page_dirty() may not always |
865 | * be called inside page lock (it's recommended but not enforced). | 879 | * be called inside page lock (it's recommended but not enforced). |
866 | */ | 880 | */ |
867 | mapping = page_mapping(p); | 881 | mapping = page_mapping(hpage); |
868 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 882 | if (!PageDirty(hpage) && mapping && |
869 | if (page_mkclean(p)) { | 883 | mapping_cap_writeback_dirty(mapping)) { |
870 | SetPageDirty(p); | 884 | if (page_mkclean(hpage)) { |
885 | SetPageDirty(hpage); | ||
871 | } else { | 886 | } else { |
872 | kill = 0; | 887 | kill = 0; |
873 | ttu |= TTU_IGNORE_HWPOISON; | 888 | ttu |= TTU_IGNORE_HWPOISON; |
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
886 | * there's nothing that can be done. | 901 | * there's nothing that can be done. |
887 | */ | 902 | */ |
888 | if (kill) | 903 | if (kill) |
889 | collect_procs(p, &tokill); | 904 | collect_procs(hpage, &tokill); |
890 | 905 | ||
891 | /* | 906 | /* |
892 | * try_to_unmap can fail temporarily due to races. | 907 | * try_to_unmap can fail temporarily due to races. |
893 | * Try a few times (RED-PEN better strategy?) | 908 | * Try a few times (RED-PEN better strategy?) |
894 | */ | 909 | */ |
895 | for (i = 0; i < N_UNMAP_TRIES; i++) { | 910 | for (i = 0; i < N_UNMAP_TRIES; i++) { |
896 | ret = try_to_unmap(p, ttu); | 911 | ret = try_to_unmap(hpage, ttu); |
897 | if (ret == SWAP_SUCCESS) | 912 | if (ret == SWAP_SUCCESS) |
898 | break; | 913 | break; |
899 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | 914 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); |
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
901 | 916 | ||
902 | if (ret != SWAP_SUCCESS) | 917 | if (ret != SWAP_SUCCESS) |
903 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 918 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
904 | pfn, page_mapcount(p)); | 919 | pfn, page_mapcount(hpage)); |
905 | 920 | ||
906 | /* | 921 | /* |
907 | * Now that the dirty bit has been propagated to the | 922 | * Now that the dirty bit has been propagated to the |
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
912 | * use a more force-full uncatchable kill to prevent | 927 | * use a more force-full uncatchable kill to prevent |
913 | * any accesses to the poisoned memory. | 928 | * any accesses to the poisoned memory. |
914 | */ | 929 | */ |
915 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 930 | kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, |
916 | ret != SWAP_SUCCESS, pfn); | 931 | ret != SWAP_SUCCESS, pfn); |
917 | 932 | ||
918 | return ret; | 933 | return ret; |
919 | } | 934 | } |
920 | 935 | ||
936 | static void set_page_hwpoison_huge_page(struct page *hpage) | ||
937 | { | ||
938 | int i; | ||
939 | int nr_pages = 1 << compound_order(hpage); | ||
940 | for (i = 0; i < nr_pages; i++) | ||
941 | SetPageHWPoison(hpage + i); | ||
942 | } | ||
943 | |||
944 | static void clear_page_hwpoison_huge_page(struct page *hpage) | ||
945 | { | ||
946 | int i; | ||
947 | int nr_pages = 1 << compound_order(hpage); | ||
948 | for (i = 0; i < nr_pages; i++) | ||
949 | ClearPageHWPoison(hpage + i); | ||
950 | } | ||
951 | |||
921 | int __memory_failure(unsigned long pfn, int trapno, int flags) | 952 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
922 | { | 953 | { |
923 | struct page_state *ps; | 954 | struct page_state *ps; |
924 | struct page *p; | 955 | struct page *p; |
956 | struct page *hpage; | ||
925 | int res; | 957 | int res; |
958 | unsigned int nr_pages; | ||
926 | 959 | ||
927 | if (!sysctl_memory_failure_recovery) | 960 | if (!sysctl_memory_failure_recovery) |
928 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 961 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
935 | } | 968 | } |
936 | 969 | ||
937 | p = pfn_to_page(pfn); | 970 | p = pfn_to_page(pfn); |
971 | hpage = compound_head(p); | ||
938 | if (TestSetPageHWPoison(p)) { | 972 | if (TestSetPageHWPoison(p)) { |
939 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); | 973 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
940 | return 0; | 974 | return 0; |
941 | } | 975 | } |
942 | 976 | ||
943 | atomic_long_add(1, &mce_bad_pages); | 977 | nr_pages = 1 << compound_order(hpage); |
978 | atomic_long_add(nr_pages, &mce_bad_pages); | ||
944 | 979 | ||
945 | /* | 980 | /* |
946 | * We need/can do nothing about count=0 pages. | 981 | * We need/can do nothing about count=0 pages. |
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
954 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 989 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
955 | */ | 990 | */ |
956 | if (!(flags & MF_COUNT_INCREASED) && | 991 | if (!(flags & MF_COUNT_INCREASED) && |
957 | !get_page_unless_zero(compound_head(p))) { | 992 | !get_page_unless_zero(hpage)) { |
958 | if (is_free_buddy_page(p)) { | 993 | if (is_free_buddy_page(p)) { |
959 | action_result(pfn, "free buddy", DELAYED); | 994 | action_result(pfn, "free buddy", DELAYED); |
960 | return 0; | 995 | return 0; |
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
972 | * The check (unnecessarily) ignores LRU pages being isolated and | 1007 | * The check (unnecessarily) ignores LRU pages being isolated and |
973 | * walked by the page reclaim code, however that's not a big loss. | 1008 | * walked by the page reclaim code, however that's not a big loss. |
974 | */ | 1009 | */ |
975 | if (!PageLRU(p)) | 1010 | if (!PageLRU(p) && !PageHuge(p)) |
976 | shake_page(p, 0); | 1011 | shake_page(p, 0); |
977 | if (!PageLRU(p)) { | 1012 | if (!PageLRU(p) && !PageHuge(p)) { |
978 | /* | 1013 | /* |
979 | * shake_page could have turned it free. | 1014 | * shake_page could have turned it free. |
980 | */ | 1015 | */ |
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
992 | * It's very difficult to mess with pages currently under IO | 1027 | * It's very difficult to mess with pages currently under IO |
993 | * and in many cases impossible, so we just avoid it here. | 1028 | * and in many cases impossible, so we just avoid it here. |
994 | */ | 1029 | */ |
995 | lock_page_nosync(p); | 1030 | lock_page_nosync(hpage); |
996 | 1031 | ||
997 | /* | 1032 | /* |
998 | * unpoison always clear PG_hwpoison inside page lock | 1033 | * unpoison always clear PG_hwpoison inside page lock |
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1004 | } | 1039 | } |
1005 | if (hwpoison_filter(p)) { | 1040 | if (hwpoison_filter(p)) { |
1006 | if (TestClearPageHWPoison(p)) | 1041 | if (TestClearPageHWPoison(p)) |
1007 | atomic_long_dec(&mce_bad_pages); | 1042 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1008 | unlock_page(p); | 1043 | unlock_page(hpage); |
1009 | put_page(p); | 1044 | put_page(hpage); |
1045 | return 0; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * For error on the tail page, we should set PG_hwpoison | ||
1050 | * on the head page to show that the hugepage is hwpoisoned | ||
1051 | */ | ||
1052 | if (PageTail(p) && TestSetPageHWPoison(hpage)) { | ||
1053 | action_result(pfn, "hugepage already hardware poisoned", | ||
1054 | IGNORED); | ||
1055 | unlock_page(hpage); | ||
1056 | put_page(hpage); | ||
1010 | return 0; | 1057 | return 0; |
1011 | } | 1058 | } |
1059 | /* | ||
1060 | * Set PG_hwpoison on all pages in an error hugepage, | ||
1061 | * because containment is done in hugepage unit for now. | ||
1062 | * Since we have done TestSetPageHWPoison() for the head page with | ||
1063 | * page lock held, we can safely set PG_hwpoison bits on tail pages. | ||
1064 | */ | ||
1065 | if (PageHuge(p)) | ||
1066 | set_page_hwpoison_huge_page(hpage); | ||
1012 | 1067 | ||
1013 | wait_on_page_writeback(p); | 1068 | wait_on_page_writeback(p); |
1014 | 1069 | ||
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1039 | } | 1094 | } |
1040 | } | 1095 | } |
1041 | out: | 1096 | out: |
1042 | unlock_page(p); | 1097 | unlock_page(hpage); |
1043 | return res; | 1098 | return res; |
1044 | } | 1099 | } |
1045 | EXPORT_SYMBOL_GPL(__memory_failure); | 1100 | EXPORT_SYMBOL_GPL(__memory_failure); |
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn) | |||
1083 | struct page *page; | 1138 | struct page *page; |
1084 | struct page *p; | 1139 | struct page *p; |
1085 | int freeit = 0; | 1140 | int freeit = 0; |
1141 | unsigned int nr_pages; | ||
1086 | 1142 | ||
1087 | if (!pfn_valid(pfn)) | 1143 | if (!pfn_valid(pfn)) |
1088 | return -ENXIO; | 1144 | return -ENXIO; |
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn) | |||
1095 | return 0; | 1151 | return 0; |
1096 | } | 1152 | } |
1097 | 1153 | ||
1154 | nr_pages = 1 << compound_order(page); | ||
1155 | |||
1098 | if (!get_page_unless_zero(page)) { | 1156 | if (!get_page_unless_zero(page)) { |
1099 | if (TestClearPageHWPoison(p)) | 1157 | if (TestClearPageHWPoison(p)) |
1100 | atomic_long_dec(&mce_bad_pages); | 1158 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1101 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1159 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1102 | return 0; | 1160 | return 0; |
1103 | } | 1161 | } |
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn) | |||
1109 | * the PG_hwpoison page will be caught and isolated on the entrance to | 1167 | * the PG_hwpoison page will be caught and isolated on the entrance to |
1110 | * the free buddy page pool. | 1168 | * the free buddy page pool. |
1111 | */ | 1169 | */ |
1112 | if (TestClearPageHWPoison(p)) { | 1170 | if (TestClearPageHWPoison(page)) { |
1113 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | 1171 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); |
1114 | atomic_long_dec(&mce_bad_pages); | 1172 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1115 | freeit = 1; | 1173 | freeit = 1; |
1116 | } | 1174 | } |
1175 | if (PageHuge(p)) | ||
1176 | clear_page_hwpoison_huge_page(page); | ||
1117 | unlock_page(page); | 1177 | unlock_page(page); |
1118 | 1178 | ||
1119 | put_page(page); | 1179 | put_page(page); |
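The memory-failure hunks above switch hardware-poison handling to hugepage granularity: mce_bad_pages now moves in steps of 1 << compound_order(hpage) (512 for a 2MB hugepage on 4KB base pages), the compound head is what gets locked, refcounted and unmapped, and PG_hwpoison is mirrored onto every tail page. A minimal sketch of what that head-page convention allows, using a hypothetical helper name; illustrative only, not part of the patch:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Hypothetical helper, illustration only: __memory_failure() sets
 * PG_hwpoison on the head page and, for hugepages, on every tail page,
 * so poisoning can be tested through the compound head alone. */
static bool hugepage_is_poisoned(struct page *p)
{
	return PageHWPoison(compound_head(p));
}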
diff --git a/mm/memory.c b/mm/memory.c index 858829d06a92..9b3b73f4ae9c 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2760,6 +2760,26 @@ out_release: | |||
2760 | } | 2760 | } |
2761 | 2761 | ||
2762 | /* | 2762 | /* |
2763 | * This is like a special single-page "expand_downwards()", | ||
2764 | * except we must first make sure that 'address-PAGE_SIZE' | ||
2765 | * doesn't hit another vma. | ||
2766 | * | ||
2767 | * The "find_vma()" will do the right thing even if we wrap | ||
2768 | */ | ||
2769 | static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) | ||
2770 | { | ||
2771 | address &= PAGE_MASK; | ||
2772 | if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { | ||
2773 | address -= PAGE_SIZE; | ||
2774 | if (find_vma(vma->vm_mm, address) != vma) | ||
2775 | return -ENOMEM; | ||
2776 | |||
2777 | expand_stack(vma, address); | ||
2778 | } | ||
2779 | return 0; | ||
2780 | } | ||
2781 | |||
2782 | /* | ||
2763 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2783 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2764 | * but allow concurrent faults), and pte mapped but not yet locked. | 2784 | * but allow concurrent faults), and pte mapped but not yet locked. |
2765 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2785 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
@@ -2772,6 +2792,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2772 | spinlock_t *ptl; | 2792 | spinlock_t *ptl; |
2773 | pte_t entry; | 2793 | pte_t entry; |
2774 | 2794 | ||
2795 | if (check_stack_guard_page(vma, address) < 0) { | ||
2796 | pte_unmap(page_table); | ||
2797 | return VM_FAULT_SIGBUS; | ||
2798 | } | ||
2799 | |||
2775 | if (!(flags & FAULT_FLAG_WRITE)) { | 2800 | if (!(flags & FAULT_FLAG_WRITE)) { |
2776 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), | 2801 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), |
2777 | vma->vm_page_prot)); | 2802 | vma->vm_page_prot)); |
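The new check_stack_guard_page() sits at the top of do_anonymous_page(), so the very first touch of a page at the bottom of a VM_GROWSDOWN vma now verifies that the page directly below is not owned by another vma before expand_stack() grows the stack. A userspace sketch of the path being exercised; everything here is illustrative and not defined by the patch:

#include <alloca.h>
#include <string.h>

/* Each alloca() pushes the frame roughly one page further down and the
 * memset() first-touches it, so the write fault goes through
 * do_anonymous_page() -> check_stack_guard_page() and the stack vma is
 * extended one page at a time. */
int main(void)
{
	for (int i = 0; i < 256; i++) {
		char *p = alloca(4096);
		memset(p, 0, 4096);
	}
	return 0;
}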
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d3def05a33d9..5014e50644d1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -106,7 +106,7 @@ static void boost_dying_task_prio(struct task_struct *p, | |||
106 | * pointer. Return p, or any of its subthreads with a valid ->mm, with | 106 | * pointer. Return p, or any of its subthreads with a valid ->mm, with |
107 | * task_lock() held. | 107 | * task_lock() held. |
108 | */ | 108 | */ |
109 | static struct task_struct *find_lock_task_mm(struct task_struct *p) | 109 | struct task_struct *find_lock_task_mm(struct task_struct *p) |
110 | { | 110 | { |
111 | struct task_struct *t = p; | 111 | struct task_struct *t = p; |
112 | 112 | ||
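The only change to oom_kill.c is dropping the static qualifier so find_lock_task_mm() becomes available to other mm/ code. Its contract is as the comment says: it returns @p or one of its subthreads with a valid ->mm and with task_lock() held, which the caller must drop. A caller sketch; the helper name and the use of get_mm_rss() are illustrative choices, not taken from the patch:

#include <linux/sched.h>
#include <linux/mm.h>

extern struct task_struct *find_lock_task_mm(struct task_struct *p);

/* Illustrative caller: read an ->mm statistic for @p or one of its
 * subthreads; the task_lock() held by find_lock_task_mm() keeps ->mm
 * stable until we task_unlock(). */
static unsigned long task_rss_sketch(struct task_struct *p)
{
	struct task_struct *t = find_lock_task_mm(p);
	unsigned long rss = 0;

	if (t) {
		rss = get_mm_rss(t->mm);
		task_unlock(t);
	}
	return rss;
}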
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0c6258bd1ba3..20890d80c7ef 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -253,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
253 | } | 253 | } |
254 | } | 254 | } |
255 | 255 | ||
256 | /* | ||
257 | * Clip the earned share of dirty pages to that which is actually available. | ||
258 | * This avoids exceeding the total dirty_limit when the floating averages | ||
259 | * fluctuate too quickly. | ||
260 | */ | ||
261 | static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, | ||
262 | unsigned long dirty, unsigned long *pbdi_dirty) | ||
263 | { | ||
264 | unsigned long avail_dirty; | ||
265 | |||
266 | avail_dirty = global_page_state(NR_FILE_DIRTY) + | ||
267 | global_page_state(NR_WRITEBACK) + | ||
268 | global_page_state(NR_UNSTABLE_NFS) + | ||
269 | global_page_state(NR_WRITEBACK_TEMP); | ||
270 | |||
271 | if (avail_dirty < dirty) | ||
272 | avail_dirty = dirty - avail_dirty; | ||
273 | else | ||
274 | avail_dirty = 0; | ||
275 | |||
276 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | ||
277 | bdi_stat(bdi, BDI_WRITEBACK); | ||
278 | |||
279 | *pbdi_dirty = min(*pbdi_dirty, avail_dirty); | ||
280 | } | ||
281 | |||
282 | static inline void task_dirties_fraction(struct task_struct *tsk, | 256 | static inline void task_dirties_fraction(struct task_struct *tsk, |
283 | long *numerator, long *denominator) | 257 | long *numerator, long *denominator) |
284 | { | 258 | { |
@@ -287,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
287 | } | 261 | } |
288 | 262 | ||
289 | /* | 263 | /* |
290 | * scale the dirty limit | 264 | * task_dirty_limit - scale down dirty throttling threshold for one task |
291 | * | 265 | * |
292 | * task specific dirty limit: | 266 | * task specific dirty limit: |
293 | * | 267 | * |
294 | * dirty -= (dirty/8) * p_{t} | 268 | * dirty -= (dirty/8) * p_{t} |
269 | * | ||
270 | * To protect light/slow dirtying tasks from heavier/fast ones, we start | ||
271 | * throttling individual tasks before reaching the bdi dirty limit. | ||
272 | * Relatively low thresholds will be allocated to heavy dirtiers. So when | ||
273 | * dirty pages grow large, heavy dirtiers will be throttled first, which will | ||
274 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | ||
275 | * dirty threshold may never get throttled. | ||
295 | */ | 276 | */ |
296 | static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) | 277 | static unsigned long task_dirty_limit(struct task_struct *tsk, |
278 | unsigned long bdi_dirty) | ||
297 | { | 279 | { |
298 | long numerator, denominator; | 280 | long numerator, denominator; |
299 | unsigned long dirty = *pdirty; | 281 | unsigned long dirty = bdi_dirty; |
300 | u64 inv = dirty >> 3; | 282 | u64 inv = dirty >> 3; |
301 | 283 | ||
302 | task_dirties_fraction(tsk, &numerator, &denominator); | 284 | task_dirties_fraction(tsk, &numerator, &denominator); |
@@ -304,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) | |||
304 | do_div(inv, denominator); | 286 | do_div(inv, denominator); |
305 | 287 | ||
306 | dirty -= inv; | 288 | dirty -= inv; |
307 | if (dirty < *pdirty/2) | ||
308 | dirty = *pdirty/2; | ||
309 | 289 | ||
310 | *pdirty = dirty; | 290 | return max(dirty, bdi_dirty/2); |
311 | } | 291 | } |
312 | 292 | ||
313 | /* | 293 | /* |
@@ -417,9 +397,16 @@ unsigned long determine_dirtyable_memory(void) | |||
417 | return x + 1; /* Ensure that we never return 0 */ | 397 | return x + 1; /* Ensure that we never return 0 */ |
418 | } | 398 | } |
419 | 399 | ||
420 | void | 400 | /** |
421 | get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, | 401 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
422 | unsigned long *pbdi_dirty, struct backing_dev_info *bdi) | 402 | * |
403 | * Calculate the dirty thresholds based on sysctl parameters | ||
404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | ||
405 | * - vm.dirty_ratio or vm.dirty_bytes | ||
406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | ||
407 | * runtime tasks. | ||
408 | */ | ||
409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | ||
423 | { | 410 | { |
424 | unsigned long background; | 411 | unsigned long background; |
425 | unsigned long dirty; | 412 | unsigned long dirty; |
@@ -451,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, | |||
451 | } | 438 | } |
452 | *pbackground = background; | 439 | *pbackground = background; |
453 | *pdirty = dirty; | 440 | *pdirty = dirty; |
441 | } | ||
442 | |||
443 | /** | ||
444 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | ||
445 | * | ||
446 | * Allocate high/low dirty limits to fast/slow devices, in order to prevent | ||
447 | * - starving fast devices | ||
448 | * - piling up dirty pages (that will take long time to sync) on slow devices | ||
449 | * | ||
450 | * The bdi's share of dirty limit will be adapting to its throughput and | ||
451 | * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. | ||
452 | */ | ||
453 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | ||
454 | { | ||
455 | u64 bdi_dirty; | ||
456 | long numerator, denominator; | ||
457 | |||
458 | /* | ||
459 | * Calculate this BDI's share of the dirty ratio. | ||
460 | */ | ||
461 | bdi_writeout_fraction(bdi, &numerator, &denominator); | ||
454 | 462 | ||
455 | if (bdi) { | 463 | bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; |
456 | u64 bdi_dirty; | 464 | bdi_dirty *= numerator; |
457 | long numerator, denominator; | 465 | do_div(bdi_dirty, denominator); |
458 | 466 | ||
459 | /* | 467 | bdi_dirty += (dirty * bdi->min_ratio) / 100; |
460 | * Calculate this BDI's share of the dirty ratio. | 468 | if (bdi_dirty > (dirty * bdi->max_ratio) / 100) |
461 | */ | 469 | bdi_dirty = dirty * bdi->max_ratio / 100; |
462 | bdi_writeout_fraction(bdi, &numerator, &denominator); | 470 | |
463 | 471 | return bdi_dirty; | |
464 | bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; | ||
465 | bdi_dirty *= numerator; | ||
466 | do_div(bdi_dirty, denominator); | ||
467 | bdi_dirty += (dirty * bdi->min_ratio) / 100; | ||
468 | if (bdi_dirty > (dirty * bdi->max_ratio) / 100) | ||
469 | bdi_dirty = dirty * bdi->max_ratio / 100; | ||
470 | |||
471 | *pbdi_dirty = bdi_dirty; | ||
472 | clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); | ||
473 | task_dirty_limit(current, pbdi_dirty); | ||
474 | } | ||
475 | } | 472 | } |
476 | 473 | ||
477 | /* | 474 | /* |
@@ -491,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
491 | unsigned long bdi_thresh; | 488 | unsigned long bdi_thresh; |
492 | unsigned long pages_written = 0; | 489 | unsigned long pages_written = 0; |
493 | unsigned long pause = 1; | 490 | unsigned long pause = 1; |
494 | 491 | bool dirty_exceeded = false; |
495 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 492 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
496 | 493 | ||
497 | for (;;) { | 494 | for (;;) { |
@@ -502,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
502 | .range_cyclic = 1, | 499 | .range_cyclic = 1, |
503 | }; | 500 | }; |
504 | 501 | ||
505 | get_dirty_limits(&background_thresh, &dirty_thresh, | ||
506 | &bdi_thresh, bdi); | ||
507 | |||
508 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 502 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
509 | global_page_state(NR_UNSTABLE_NFS); | 503 | global_page_state(NR_UNSTABLE_NFS); |
510 | nr_writeback = global_page_state(NR_WRITEBACK); | 504 | nr_writeback = global_page_state(NR_WRITEBACK); |
511 | 505 | ||
512 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 506 | global_dirty_limits(&background_thresh, &dirty_thresh); |
513 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | ||
514 | |||
515 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) | ||
516 | break; | ||
517 | 507 | ||
518 | /* | 508 | /* |
519 | * Throttle it only when the background writeback cannot | 509 | * Throttle it only when the background writeback cannot |
@@ -524,26 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
524 | (background_thresh + dirty_thresh) / 2) | 514 | (background_thresh + dirty_thresh) / 2) |
525 | break; | 515 | break; |
526 | 516 | ||
527 | if (!bdi->dirty_exceeded) | 517 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
528 | bdi->dirty_exceeded = 1; | 518 | bdi_thresh = task_dirty_limit(current, bdi_thresh); |
529 | |||
530 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | ||
531 | * Unstable writes are a feature of certain networked | ||
532 | * filesystems (i.e. NFS) in which data may have been | ||
533 | * written to the server's write cache, but has not yet | ||
534 | * been flushed to permanent storage. | ||
535 | * Only move pages to writeback if this bdi is over its | ||
536 | * threshold otherwise wait until the disk writes catch | ||
537 | * up. | ||
538 | */ | ||
539 | trace_wbc_balance_dirty_start(&wbc, bdi); | ||
540 | if (bdi_nr_reclaimable > bdi_thresh) { | ||
541 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
542 | pages_written += write_chunk - wbc.nr_to_write; | ||
543 | get_dirty_limits(&background_thresh, &dirty_thresh, | ||
544 | &bdi_thresh, bdi); | ||
545 | trace_wbc_balance_dirty_written(&wbc, bdi); | ||
546 | } | ||
547 | 519 | ||
548 | /* | 520 | /* |
549 | * In order to avoid the stacked BDI deadlock we need | 521 | * In order to avoid the stacked BDI deadlock we need |
@@ -558,16 +530,44 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
558 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | 530 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { |
559 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 531 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
560 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | 532 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); |
561 | } else if (bdi_nr_reclaimable) { | 533 | } else { |
562 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 534 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
563 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | 535 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); |
564 | } | 536 | } |
565 | 537 | ||
566 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) | 538 | /* |
539 | * The bdi thresh is somehow "soft" limit derived from the | ||
540 | * global "hard" limit. The former helps to prevent heavy IO | ||
541 | * bdi or process from holding back light ones; The latter is | ||
542 | * the last resort safeguard. | ||
543 | */ | ||
544 | dirty_exceeded = | ||
545 | (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) | ||
546 | || (nr_reclaimable + nr_writeback >= dirty_thresh); | ||
547 | |||
548 | if (!dirty_exceeded) | ||
567 | break; | 549 | break; |
568 | if (pages_written >= write_chunk) | ||
569 | break; /* We've done our duty */ | ||
570 | 550 | ||
551 | if (!bdi->dirty_exceeded) | ||
552 | bdi->dirty_exceeded = 1; | ||
553 | |||
554 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | ||
555 | * Unstable writes are a feature of certain networked | ||
556 | * filesystems (i.e. NFS) in which data may have been | ||
557 | * written to the server's write cache, but has not yet | ||
558 | * been flushed to permanent storage. | ||
559 | * Only move pages to writeback if this bdi is over its | ||
560 | * threshold otherwise wait until the disk writes catch | ||
561 | * up. | ||
562 | */ | ||
563 | trace_wbc_balance_dirty_start(&wbc, bdi); | ||
564 | if (bdi_nr_reclaimable > bdi_thresh) { | ||
565 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
566 | pages_written += write_chunk - wbc.nr_to_write; | ||
567 | trace_wbc_balance_dirty_written(&wbc, bdi); | ||
568 | if (pages_written >= write_chunk) | ||
569 | break; /* We've done our duty */ | ||
570 | } | ||
571 | trace_wbc_balance_dirty_wait(&wbc, bdi); | 571 | trace_wbc_balance_dirty_wait(&wbc, bdi); |
572 | __set_current_state(TASK_INTERRUPTIBLE); | 572 | __set_current_state(TASK_INTERRUPTIBLE); |
573 | io_schedule_timeout(pause); | 573 | io_schedule_timeout(pause); |
@@ -581,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
581 | pause = HZ / 10; | 581 | pause = HZ / 10; |
582 | } | 582 | } |
583 | 583 | ||
584 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && | 584 | if (!dirty_exceeded && bdi->dirty_exceeded) |
585 | bdi->dirty_exceeded) | ||
586 | bdi->dirty_exceeded = 0; | 585 | bdi->dirty_exceeded = 0; |
587 | 586 | ||
588 | if (writeback_in_progress(bdi)) | 587 | if (writeback_in_progress(bdi)) |
@@ -597,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
597 | * background_thresh, to keep the amount of dirty memory low. | 596 | * background_thresh, to keep the amount of dirty memory low. |
598 | */ | 597 | */ |
599 | if ((laptop_mode && pages_written) || | 598 | if ((laptop_mode && pages_written) || |
600 | (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) | 599 | (!laptop_mode && (nr_reclaimable > background_thresh))) |
601 | + global_page_state(NR_UNSTABLE_NFS)) | ||
602 | > background_thresh))) | ||
603 | bdi_start_background_writeback(bdi); | 600 | bdi_start_background_writeback(bdi); |
604 | } | 601 | } |
605 | 602 | ||
@@ -663,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
663 | unsigned long dirty_thresh; | 660 | unsigned long dirty_thresh; |
664 | 661 | ||
665 | for ( ; ; ) { | 662 | for ( ; ; ) { |
666 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 663 | global_dirty_limits(&background_thresh, &dirty_thresh); |
667 | 664 | ||
668 | /* | 665 | /* |
669 | * Boost the allowable dirty threshold a bit for page | 666 | * Boost the allowable dirty threshold a bit for page |
@@ -825,10 +822,10 @@ void __init page_writeback_init(void) | |||
825 | /* | 822 | /* |
826 | * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. | 823 | * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. |
827 | */ | 824 | */ |
828 | #define WRITEBACK_TAG_BATCH 4096 | ||
829 | void tag_pages_for_writeback(struct address_space *mapping, | 825 | void tag_pages_for_writeback(struct address_space *mapping, |
830 | pgoff_t start, pgoff_t end) | 826 | pgoff_t start, pgoff_t end) |
831 | { | 827 | { |
828 | #define WRITEBACK_TAG_BATCH 4096 | ||
832 | unsigned long tagged; | 829 | unsigned long tagged; |
833 | 830 | ||
834 | do { | 831 | do { |
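get_dirty_limits() is gone: global_dirty_limits() now computes the system-wide background and dirty thresholds, bdi_dirty_limit() computes one device's share, and the per-task scale-down is applied explicitly by the caller through task_dirty_limit(), which stays static to this file. A sketch of how the pieces compose, mirroring the sequence balance_dirty_pages() uses above; illustrative only, and it assumes the two non-static helpers are declared in linux/writeback.h:

#include <linux/writeback.h>
#include <linux/backing-dev.h>

/* Conceptual sketch of the effective throttling threshold for one bdi:
 * global "hard" limits first, then the bdi's adaptive share; inside
 * mm/page-writeback.c the result is further scaled per task. */
static unsigned long bdi_throttle_thresh_sketch(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh, bdi_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);
	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
	/* bdi_thresh = task_dirty_limit(current, bdi_thresh); (file-local) */

	return bdi_thresh;
}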
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/memcontrol.h> | 56 | #include <linux/memcontrol.h> |
57 | #include <linux/mmu_notifier.h> | 57 | #include <linux/mmu_notifier.h> |
58 | #include <linux/migrate.h> | 58 | #include <linux/migrate.h> |
59 | #include <linux/hugetlb.h> | ||
59 | 60 | ||
60 | #include <asm/tlbflush.h> | 61 | #include <asm/tlbflush.h> |
61 | 62 | ||
@@ -350,6 +351,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
350 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 351 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
351 | unsigned long address; | 352 | unsigned long address; |
352 | 353 | ||
354 | if (unlikely(is_vm_hugetlb_page(vma))) | ||
355 | pgoff = page->index << huge_page_order(page_hstate(page)); | ||
353 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 356 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
354 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 357 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { |
355 | /* page should be within @vma mapping range */ | 358 | /* page should be within @vma mapping range */ |
@@ -394,6 +397,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
394 | pte_t *pte; | 397 | pte_t *pte; |
395 | spinlock_t *ptl; | 398 | spinlock_t *ptl; |
396 | 399 | ||
400 | if (unlikely(PageHuge(page))) { | ||
401 | pte = huge_pte_offset(mm, address); | ||
402 | ptl = &mm->page_table_lock; | ||
403 | goto check; | ||
404 | } | ||
405 | |||
397 | pgd = pgd_offset(mm, address); | 406 | pgd = pgd_offset(mm, address); |
398 | if (!pgd_present(*pgd)) | 407 | if (!pgd_present(*pgd)) |
399 | return NULL; | 408 | return NULL; |
@@ -414,6 +423,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
414 | } | 423 | } |
415 | 424 | ||
416 | ptl = pte_lockptr(mm, pmd); | 425 | ptl = pte_lockptr(mm, pmd); |
426 | check: | ||
417 | spin_lock(ptl); | 427 | spin_lock(ptl); |
418 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { | 428 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { |
419 | *ptlp = ptl; | 429 | *ptlp = ptl; |
@@ -916,6 +926,12 @@ void page_remove_rmap(struct page *page) | |||
916 | page_clear_dirty(page); | 926 | page_clear_dirty(page); |
917 | set_page_dirty(page); | 927 | set_page_dirty(page); |
918 | } | 928 | } |
929 | /* | ||
930 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | ||
931 | * and not charged by memcg for now. | ||
932 | */ | ||
933 | if (unlikely(PageHuge(page))) | ||
934 | return; | ||
919 | if (PageAnon(page)) { | 935 | if (PageAnon(page)) { |
920 | mem_cgroup_uncharge_page(page); | 936 | mem_cgroup_uncharge_page(page); |
921 | __dec_zone_page_state(page, NR_ANON_PAGES); | 937 | __dec_zone_page_state(page, NR_ANON_PAGES); |
@@ -1524,3 +1540,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | |||
1524 | return rmap_walk_file(page, rmap_one, arg); | 1540 | return rmap_walk_file(page, rmap_one, arg); |
1525 | } | 1541 | } |
1526 | #endif /* CONFIG_MIGRATION */ | 1542 | #endif /* CONFIG_MIGRATION */ |
1543 | |||
1544 | #ifdef CONFIG_HUGETLB_PAGE | ||
1545 | /* | ||
1546 | * The following three functions are for anonymous (private mapped) hugepages. | ||
1547 | * Unlike common anonymous pages, anonymous hugepages have no accounting code | ||
1548 | * and no lru code, because we handle hugepages differently from common pages. | ||
1549 | */ | ||
1550 | static void __hugepage_set_anon_rmap(struct page *page, | ||
1551 | struct vm_area_struct *vma, unsigned long address, int exclusive) | ||
1552 | { | ||
1553 | struct anon_vma *anon_vma = vma->anon_vma; | ||
1554 | BUG_ON(!anon_vma); | ||
1555 | if (!exclusive) { | ||
1556 | struct anon_vma_chain *avc; | ||
1557 | avc = list_entry(vma->anon_vma_chain.prev, | ||
1558 | struct anon_vma_chain, same_vma); | ||
1559 | anon_vma = avc->anon_vma; | ||
1560 | } | ||
1561 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
1562 | page->mapping = (struct address_space *) anon_vma; | ||
1563 | page->index = linear_page_index(vma, address); | ||
1564 | } | ||
1565 | |||
1566 | void hugepage_add_anon_rmap(struct page *page, | ||
1567 | struct vm_area_struct *vma, unsigned long address) | ||
1568 | { | ||
1569 | struct anon_vma *anon_vma = vma->anon_vma; | ||
1570 | int first; | ||
1571 | BUG_ON(!anon_vma); | ||
1572 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
1573 | first = atomic_inc_and_test(&page->_mapcount); | ||
1574 | if (first) | ||
1575 | __hugepage_set_anon_rmap(page, vma, address, 0); | ||
1576 | } | ||
1577 | |||
1578 | void hugepage_add_new_anon_rmap(struct page *page, | ||
1579 | struct vm_area_struct *vma, unsigned long address) | ||
1580 | { | ||
1581 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
1582 | atomic_set(&page->_mapcount, 0); | ||
1583 | __hugepage_set_anon_rmap(page, vma, address, 1); | ||
1584 | } | ||
1585 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
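The CONFIG_HUGETLB_PAGE block gives anonymous (MAP_PRIVATE) hugepages the same anon_vma wiring as regular anon pages while deliberately skipping LRU and memcg accounting, which is also why page_remove_rmap() bails out early for hugepages above. A hedged sketch of the fault-side usage; the real call sites live in mm/hugetlb.c, so the function below and its placement are assumptions:

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/hugetlb.h>

/* Illustrative only: a freshly allocated private hugepage mapped at
 * @address gets _mapcount set to 0 and page->mapping pointed at the
 * vma's anon_vma; a hugepage that is already anon-mapped elsewhere
 * would go through hugepage_add_anon_rmap() instead. */
static void wire_new_anon_hugepage_sketch(struct page *hpage,
		struct vm_area_struct *vma, unsigned long address)
{
	hugepage_add_new_anon_rmap(hpage, vma, address);
}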
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 918c51335d64..6b8889da69a6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
32 | #include <asm/shmparam.h> | 32 | #include <asm/shmparam.h> |
33 | 33 | ||
34 | bool vmap_lazy_unmap __read_mostly = true; | ||
34 | 35 | ||
35 | /*** Page table manipulation functions ***/ | 36 | /*** Page table manipulation functions ***/ |
36 | 37 | ||
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void) | |||
502 | { | 503 | { |
503 | unsigned int log; | 504 | unsigned int log; |
504 | 505 | ||
506 | if (!vmap_lazy_unmap) | ||
507 | return 0; | ||
508 | |||
505 | log = fls(num_online_cpus()); | 509 | log = fls(num_online_cpus()); |
506 | 510 | ||
507 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | 511 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); |
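For a sense of scale (illustrative numbers, not from the patch): with 8 online CPUs, fls(8) is 4, so lazy_max_pages() permits 4 * (32MB / 4KB) = 32768 lazily-unmapped pages, roughly 128MB of stale vmalloc mappings, to accumulate before a purge; with the new vmap_lazy_unmap flag cleared it returns 0, so freed areas are purged essentially immediately.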
diff --git a/mm/vmscan.c b/mm/vmscan.c index ec5ddccbf82e..c391c320dbaf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1969,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1969 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 1969 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
1970 | gfp_t gfp_mask, bool noswap, | 1970 | gfp_t gfp_mask, bool noswap, |
1971 | unsigned int swappiness, | 1971 | unsigned int swappiness, |
1972 | struct zone *zone, int nid) | 1972 | struct zone *zone) |
1973 | { | 1973 | { |
1974 | struct scan_control sc = { | 1974 | struct scan_control sc = { |
1975 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | ||
1975 | .may_writepage = !laptop_mode, | 1976 | .may_writepage = !laptop_mode, |
1976 | .may_unmap = 1, | 1977 | .may_unmap = 1, |
1977 | .may_swap = !noswap, | 1978 | .may_swap = !noswap, |
@@ -1979,13 +1980,8 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
1979 | .order = 0, | 1980 | .order = 0, |
1980 | .mem_cgroup = mem, | 1981 | .mem_cgroup = mem, |
1981 | }; | 1982 | }; |
1982 | nodemask_t nm = nodemask_of_node(nid); | ||
1983 | |||
1984 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 1983 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
1985 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 1984 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
1986 | sc.nodemask = &nm; | ||
1987 | sc.nr_reclaimed = 0; | ||
1988 | sc.nr_scanned = 0; | ||
1989 | 1985 | ||
1990 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, | 1986 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, |
1991 | sc.may_writepage, | 1987 | sc.may_writepage, |
@@ -2172,7 +2168,6 @@ loop_again: | |||
2172 | for (i = 0; i <= end_zone; i++) { | 2168 | for (i = 0; i <= end_zone; i++) { |
2173 | struct zone *zone = pgdat->node_zones + i; | 2169 | struct zone *zone = pgdat->node_zones + i; |
2174 | int nr_slab; | 2170 | int nr_slab; |
2175 | int nid, zid; | ||
2176 | 2171 | ||
2177 | if (!populated_zone(zone)) | 2172 | if (!populated_zone(zone)) |
2178 | continue; | 2173 | continue; |
@@ -2182,14 +2177,12 @@ loop_again: | |||
2182 | 2177 | ||
2183 | sc.nr_scanned = 0; | 2178 | sc.nr_scanned = 0; |
2184 | 2179 | ||
2185 | nid = pgdat->node_id; | ||
2186 | zid = zone_idx(zone); | ||
2187 | /* | 2180 | /* |
2188 | * Call soft limit reclaim before calling shrink_zone. | 2181 | * Call soft limit reclaim before calling shrink_zone. |
2189 | * For now we ignore the return value | 2182 | * For now we ignore the return value |
2190 | */ | 2183 | */ |
2191 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, | 2184 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); |
2192 | nid, zid); | 2185 | |
2193 | /* | 2186 | /* |
2194 | * We put equal pressure on every zone, unless one | 2187 | * We put equal pressure on every zone, unless one |
2195 | * zone has way too many pages free already. | 2188 | * zone has way too many pages free already. |