path: root/mm/memcontrol.c
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  199
1 files changed, 120 insertions, 79 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ab841031436..3878cfe399dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -600,23 +600,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
-                                         struct page_cgroup *pc,
-                                         bool charge)
+                                         bool file, int nr_pages)
 {
-        int val = (charge) ? 1 : -1;
-
         preempt_disable();
 
-        if (PageCgroupCache(pc))
-                __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+        if (file)
+                __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
         else
-                __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+                __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
 
-        if (charge)
+        /* pagein of a big page is an event. So, ignore page size */
+        if (nr_pages > 0)
                 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
         else
                 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
-        __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+
+        __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
         preempt_enable();
 }
@@ -815,7 +814,8 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
          * removed from global LRU.
          */
         mz = page_cgroup_zoneinfo(pc);
-        MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+        /* huge page split is done under lru_lock. so, we have no races. */
+        MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
         if (mem_cgroup_is_root(pc->mem_cgroup))
                 return;
         VM_BUG_ON(list_empty(&pc->lru));
@@ -836,13 +836,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
                 return;
 
         pc = lookup_page_cgroup(page);
-        /*
-         * Used bit is set without atomic ops but after smp_wmb().
-         * For making pc->mem_cgroup visible, insert smp_rmb() here.
-         */
-        smp_rmb();
         /* unused or root page is not rotated. */
-        if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+        if (!PageCgroupUsed(pc))
+                return;
+        /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+        smp_rmb();
+        if (mem_cgroup_is_root(pc->mem_cgroup))
                 return;
         mz = page_cgroup_zoneinfo(pc);
         list_move(&pc->lru, &mz->lists[lru]);
@@ -857,16 +856,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
                 return;
         pc = lookup_page_cgroup(page);
         VM_BUG_ON(PageCgroupAcctLRU(pc));
-        /*
-         * Used bit is set without atomic ops but after smp_wmb().
-         * For making pc->mem_cgroup visible, insert smp_rmb() here.
-         */
-        smp_rmb();
         if (!PageCgroupUsed(pc))
                 return;
-
+        /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+        smp_rmb();
         mz = page_cgroup_zoneinfo(pc);
-        MEM_CGROUP_ZSTAT(mz, lru) += 1;
+        /* huge page split is done under lru_lock. so, we have no races. */
+        MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
         SetPageCgroupAcctLRU(pc);
         if (mem_cgroup_is_root(pc->mem_cgroup))
                 return;
@@ -1030,14 +1026,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
                 return NULL;
 
         pc = lookup_page_cgroup(page);
-        /*
-         * Used bit is set without atomic ops but after smp_wmb().
-         * For making pc->mem_cgroup visible, insert smp_rmb() here.
-         */
-        smp_rmb();
         if (!PageCgroupUsed(pc))
                 return NULL;
-
+        /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+        smp_rmb();
         mz = page_cgroup_zoneinfo(pc);
         if (!mz)
                 return NULL;
@@ -1615,7 +1607,7 @@ void mem_cgroup_update_page_stat(struct page *page,
         if (unlikely(!mem || !PageCgroupUsed(pc)))
                 goto out;
         /* pc->mem_cgroup is unstable ? */
-        if (unlikely(mem_cgroup_stealed(mem))) {
+        if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
                 /* take a lock against to access pc->mem_cgroup */
                 move_lock_page_cgroup(pc, &flags);
                 need_unlock = true;
@@ -1840,6 +1832,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
                 if (likely(!ret))
                         return CHARGE_OK;
 
+                res_counter_uncharge(&mem->res, csize);
                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
                 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
         } else
@@ -2084,14 +2077,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
         return mem;
 }
 
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
-                                         struct page_cgroup *pc,
-                                         enum charge_type ctype)
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+                                       struct page_cgroup *pc,
+                                       enum charge_type ctype,
+                                       int page_size)
 {
+        int nr_pages = page_size >> PAGE_SHIFT;
+
+        /* try_charge() can return NULL to *memcg, taking care of it. */
+        if (!mem)
+                return;
+
+        lock_page_cgroup(pc);
+        if (unlikely(PageCgroupUsed(pc))) {
+                unlock_page_cgroup(pc);
+                mem_cgroup_cancel_charge(mem, page_size);
+                return;
+        }
+        /*
+         * we don't need page_cgroup_lock about tail pages, becase they are not
+         * accessed by any other context at this point.
+         */
         pc->mem_cgroup = mem;
         /*
          * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2115,43 +2121,57 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
                 break;
         }
 
-        mem_cgroup_charge_statistics(mem, pc, true);
+        mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+        unlock_page_cgroup(pc);
+        /*
+         * "charge_statistics" updated event counter. Then, check it.
+         * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+         * if they exceeds softlimit.
+         */
+        memcg_check_events(mem, pc->page);
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
-                                       struct page_cgroup *pc,
-                                       enum charge_type ctype,
-                                       int page_size)
-{
-        int i;
-        int count = page_size >> PAGE_SHIFT;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-        /* try_charge() can return NULL to *memcg, taking care of it. */
-        if (!mem)
-                return;
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+                        (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+        struct page_cgroup *head_pc = lookup_page_cgroup(head);
+        struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+        unsigned long flags;
 
-        lock_page_cgroup(pc);
-        if (unlikely(PageCgroupUsed(pc))) {
-                unlock_page_cgroup(pc);
-                mem_cgroup_cancel_charge(mem, page_size);
+        if (mem_cgroup_disabled())
                 return;
-        }
-
         /*
-         * we don't need page_cgroup_lock about tail pages, becase they are not
-         * accessed by any other context at this point.
+         * We have no races with charge/uncharge but will have races with
+         * page state accounting.
          */
-        for (i = 0; i < count; i++)
-                ____mem_cgroup_commit_charge(mem, pc + i, ctype);
+        move_lock_page_cgroup(head_pc, &flags);
 
-        unlock_page_cgroup(pc);
-        /*
-         * "charge_statistics" updated event counter. Then, check it.
-         * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-         * if they exceeds softlimit.
-         */
-        memcg_check_events(mem, pc->page);
+        tail_pc->mem_cgroup = head_pc->mem_cgroup;
+        smp_wmb(); /* see __commit_charge() */
+        if (PageCgroupAcctLRU(head_pc)) {
+                enum lru_list lru;
+                struct mem_cgroup_per_zone *mz;
+
+                /*
+                 * LRU flags cannot be copied because we need to add tail
+                 *.page to LRU by generic call and our hook will be called.
+                 * We hold lru_lock, then, reduce counter directly.
+                 */
+                lru = page_lru(head);
+                mz = page_cgroup_zoneinfo(head_pc);
+                MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+        }
+        tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+        move_unlock_page_cgroup(head_pc, &flags);
 }
+#endif
 
 /**
  * __mem_cgroup_move_account - move account of the page
@@ -2171,8 +2191,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  */
 
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
-        struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+        struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+        int charge_size)
 {
+        int nr_pages = charge_size >> PAGE_SHIFT;
+
         VM_BUG_ON(from == to);
         VM_BUG_ON(PageLRU(pc->page));
         VM_BUG_ON(!page_is_cgroup_locked(pc));
@@ -2186,14 +2209,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
                 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                 preempt_enable();
         }
-        mem_cgroup_charge_statistics(from, pc, false);
+        mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
         if (uncharge)
                 /* This is not "cancel", but cancel_charge does all we need. */
-                mem_cgroup_cancel_charge(from, PAGE_SIZE);
+                mem_cgroup_cancel_charge(from, charge_size);
 
         /* caller should have done css_get */
         pc->mem_cgroup = to;
-        mem_cgroup_charge_statistics(to, pc, true);
+        mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
         /*
          * We charges against "to" which may not have any tasks. Then, "to"
          * can be under rmdir(). But in current implementation, caller of
@@ -2208,15 +2231,24 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
  * __mem_cgroup_move_account()
  */
 static int mem_cgroup_move_account(struct page_cgroup *pc,
-        struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+                struct mem_cgroup *from, struct mem_cgroup *to,
+                bool uncharge, int charge_size)
 {
         int ret = -EINVAL;
         unsigned long flags;
+        /*
+         * The page is isolated from LRU. So, collapse function
+         * will not handle this page. But page splitting can happen.
+         * Do this check under compound_page_lock(). The caller should
+         * hold it.
+         */
+        if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+                return -EBUSY;
 
         lock_page_cgroup(pc);
         if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
                 move_lock_page_cgroup(pc, &flags);
-                __mem_cgroup_move_account(pc, from, to, uncharge);
+                __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
                 move_unlock_page_cgroup(pc, &flags);
                 ret = 0;
         }
@@ -2241,6 +2273,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
         struct cgroup *cg = child->css.cgroup;
         struct cgroup *pcg = cg->parent;
         struct mem_cgroup *parent;
+        int page_size = PAGE_SIZE;
+        unsigned long flags;
         int ret;
 
         /* Is ROOT ? */
@@ -2253,15 +2287,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
         if (isolate_lru_page(page))
                 goto put;
 
+        if (PageTransHuge(page))
+                page_size = HPAGE_SIZE;
+
         parent = mem_cgroup_from_cont(pcg);
-        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
-                                      PAGE_SIZE);
+        ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+                                &parent, false, page_size);
         if (ret || !parent)
                 goto put_back;
 
-        ret = mem_cgroup_move_account(pc, child, parent, true);
+        if (page_size > PAGE_SIZE)
+                flags = compound_lock_irqsave(page);
+
+        ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
         if (ret)
-                mem_cgroup_cancel_charge(parent, PAGE_SIZE);
+                mem_cgroup_cancel_charge(parent, page_size);
+
+        if (page_size > PAGE_SIZE)
+                compound_unlock_irqrestore(page, flags);
 put_back:
         putback_lru_page(page);
 put:
@@ -2546,7 +2589,6 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
-        int i;
         int count;
         struct page_cgroup *pc;
         struct mem_cgroup *mem = NULL;
@@ -2596,8 +2638,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                 break;
         }
 
-        for (i = 0; i < count; i++)
-                mem_cgroup_charge_statistics(mem, pc + i, false);
+        mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
 
         ClearPageCgroupUsed(pc);
         /*
@@ -4844,7 +4885,7 @@ retry:
                                 goto put;
                         pc = lookup_page_cgroup(page);
                         if (!mem_cgroup_move_account(pc,
-                                                mc.from, mc.to, false)) {
+                                                mc.from, mc.to, false, PAGE_SIZE)) {
                                 mc.precharge--;
                                 /* we uncharge from mc.from later. */
                                 mc.moved_charge++;