Diffstat (limited to 'mm/memcontrol.c')
 -rw-r--r--  mm/memcontrol.c | 266
 1 file changed, 172 insertions(+), 94 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ab841031436..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -600,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 bool charge)
+					 bool file, int nr_pages)
 {
-	int val = (charge) ? 1 : -1;
-
 	preempt_disable();
 
-	if (PageCgroupCache(pc))
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+	if (file)
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
 	else
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
 
-	if (charge)
+	/* pagein of a big page is an event. So, ignore page size */
+	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
-	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+		nr_pages = -nr_pages; /* for event */
+	}
+
+	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
 	preempt_enable();
 }
@@ -815,7 +816,8 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	VM_BUG_ON(list_empty(&pc->lru));
@@ -836,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 		return;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	/* unused or root page is not rotated. */
-	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+	if (!PageCgroupUsed(pc))
+		return;
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
+	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -857,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 		return;
 	pc = lookup_page_cgroup(page);
 	VM_BUG_ON(PageCgroupAcctLRU(pc));
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
 	SetPageCgroupAcctLRU(pc);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
@@ -1030,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 		return NULL;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return NULL;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
 	if (!mz)
 		return NULL;
@@ -1119,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
@@ -1615,7 +1626,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 	if (unlikely(!mem || !PageCgroupUsed(pc)))
 		goto out;
 	/* pc->mem_cgroup is unstable ? */
-	if (unlikely(mem_cgroup_stealed(mem))) {
+	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
 		/* take a lock against to access pc->mem_cgroup */
 		move_lock_page_cgroup(pc, &flags);
 		need_unlock = true;
@@ -1840,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 		if (likely(!ret))
 			return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages. Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -2084,14 +2107,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return mem;
 }
 
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 enum charge_type ctype)
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+				       struct page_cgroup *pc,
+				       enum charge_type ctype,
+				       int page_size)
 {
+	int nr_pages = page_size >> PAGE_SHIFT;
+
+	/* try_charge() can return NULL to *memcg, taking care of it. */
+	if (!mem)
+		return;
+
+	lock_page_cgroup(pc);
+	if (unlikely(PageCgroupUsed(pc))) {
+		unlock_page_cgroup(pc);
+		mem_cgroup_cancel_charge(mem, page_size);
+		return;
+	}
+	/*
+	 * we don't need page_cgroup_lock about tail pages, becase they are not
+	 * accessed by any other context at this point.
+	 */
 	pc->mem_cgroup = mem;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2115,43 +2151,57 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, pc, true);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+	unlock_page_cgroup(pc);
+	/*
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
+	 */
+	memcg_check_events(mem, pc->page);
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
-				       struct page_cgroup *pc,
-				       enum charge_type ctype,
-				       int page_size)
-{
-	int i;
-	int count = page_size >> PAGE_SHIFT;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-	/* try_charge() can return NULL to *memcg, taking care of it. */
-	if (!mem)
-		return;
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+	struct page_cgroup *head_pc = lookup_page_cgroup(head);
+	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+	unsigned long flags;
 
-	lock_page_cgroup(pc);
-	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
-		mem_cgroup_cancel_charge(mem, page_size);
+	if (mem_cgroup_disabled())
 		return;
-	}
-
 	/*
-	 * we don't need page_cgroup_lock about tail pages, becase they are not
-	 * accessed by any other context at this point.
+	 * We have no races with charge/uncharge but will have races with
+	 * page state accounting.
 	 */
-	for (i = 0; i < count; i++)
-		____mem_cgroup_commit_charge(mem, pc + i, ctype);
+	move_lock_page_cgroup(head_pc, &flags);
 
-	unlock_page_cgroup(pc);
-	/*
-	 * "charge_statistics" updated event counter. Then, check it.
-	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-	 * if they exceeds softlimit.
-	 */
-	memcg_check_events(mem, pc->page);
+	tail_pc->mem_cgroup = head_pc->mem_cgroup;
+	smp_wmb(); /* see __commit_charge() */
+	if (PageCgroupAcctLRU(head_pc)) {
+		enum lru_list lru;
+		struct mem_cgroup_per_zone *mz;
+
+		/*
+		 * LRU flags cannot be copied because we need to add tail
+		 *.page to LRU by generic call and our hook will be called.
+		 * We hold lru_lock, then, reduce counter directly.
+		 */
+		lru = page_lru(head);
+		mz = page_cgroup_zoneinfo(head_pc);
+		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	}
+	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+	move_unlock_page_cgroup(head_pc, &flags);
 }
+#endif
 
 /**
  * __mem_cgroup_move_account - move account of the page
@@ -2171,8 +2221,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  */
 
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+	int charge_size)
 {
+	int nr_pages = charge_size >> PAGE_SHIFT;
+
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
 	VM_BUG_ON(!page_is_cgroup_locked(pc));
@@ -2186,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, pc, false);
+	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
-		mem_cgroup_cancel_charge(from, PAGE_SIZE);
+		mem_cgroup_cancel_charge(from, charge_size);
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, pc, true);
+	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -2208,15 +2261,24 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
  * __mem_cgroup_move_account()
  */
 static int mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to,
+	bool uncharge, int charge_size)
 {
 	int ret = -EINVAL;
 	unsigned long flags;
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
+	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+		return -EBUSY;
 
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
 		move_lock_page_cgroup(pc, &flags);
-		__mem_cgroup_move_account(pc, from, to, uncharge);
+		__mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
 		move_unlock_page_cgroup(pc, &flags);
 		ret = 0;
 	}
@@ -2241,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
+	int page_size = PAGE_SIZE;
+	unsigned long flags;
 	int ret;
 
 	/* Is ROOT ? */
@@ -2253,15 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (isolate_lru_page(page))
 		goto put;
 
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
+
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
-				      PAGE_SIZE);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				      &parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	ret = mem_cgroup_move_account(pc, child, parent, true);
+	if (page_size > PAGE_SIZE)
+		flags = compound_lock_irqsave(page);
+
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent, PAGE_SIZE);
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
+		compound_unlock_irqrestore(page, flags);
 put_back:
 	putback_lru_page(page);
 put:
@@ -2280,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
-	int page_size = PAGE_SIZE;
 
 	if (PageTransHuge(page)) {
 		page_size <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page. The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
 	}
 
 	pc = lookup_page_cgroup(page);
@@ -2295,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
@@ -2546,7 +2625,6 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
-	int i;
 	int count;
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
@@ -2596,8 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	for (i = 0; i < count; i++)
-		mem_cgroup_charge_statistics(mem, pc + i, false);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
 
 	ClearPageCgroupUsed(pc);
 	/*
@@ -4844,7 +4921,7 @@ retry:
 			goto put;
 		pc = lookup_page_cgroup(page);
 		if (!mem_cgroup_move_account(pc,
-					mc.from, mc.to, false)) {
+					mc.from, mc.to, false, PAGE_SIZE)) {
 			mc.precharge--;
 			/* we uncharge from mc.from later. */
 			mc.moved_charge++;
@@ -4983,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
-	if (!s || !strcmp(s, "1"))
+	if (!(*s) || !strcmp(s, "=1"))
 		really_do_swap_account = 1;
-	else if (!strcmp(s, "0"))
+	else if (!strcmp(s, "=0"))
 		really_do_swap_account = 0;
 	return 1;
 }
@@ -4993,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	enable_swap_account("0");
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);