Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 98
1 file changed, 73 insertions(+), 25 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db76ef726293..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+		nr_pages = -nr_pages; /* for event */
+	}
 
 	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
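The new braces negate nr_pages on the pageout path so that MEM_CGROUP_EVENTS always advances by the number of pages involved, whether they were charged or uncharged. A standalone sketch of the effect (illustration only, not kernel code):

	/* The event counter grows by the page count of a charge or an
	 * uncharge alike, so negative deltas are flipped first. */
	static long event_counter;

	static void account_event_sketch(long nr_pages)
	{
		if (nr_pages < 0)		/* uncharge/pageout path */
			nr_pages = -nr_pages;	/* count the magnitude only */
		event_counter += nr_pages;	/* __this_cpu_add() in the kernel */
	}
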
@@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
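mem_cgroup_check_margin() delegates to res_counter_check_margin(), which is not part of this diff. As a rough sketch of what such a margin check amounts to (field and function names below are illustrative assumptions, not the kernel's res_counter API):

	struct counter_sketch {
		unsigned long long usage;	/* bytes currently charged */
		unsigned long long limit;	/* hard limit in bytes */
	};

	/* True if charging 'bytes' more would still fit under the limit.
	 * The real counter would take its spinlock around this check. */
	static int counter_has_margin(const struct counter_sketch *cnt,
				      unsigned long long bytes)
	{
		return cnt->usage + bytes <= cnt->limit;
	}
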
@@ -1832,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 		if (likely(!ret))
 			return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages. Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
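Taken together, the retry policy of __mem_cgroup_do_charge() after this hunk can be condensed into the ladder below. This is a simplified standalone sketch; the enum and the boolean inputs are stand-ins for the kernel's CHARGE_* results, gfp checks and reclaim outcome, not the actual definitions:

	enum charge_sketch { SK_RETRY, SK_WOULDBLOCK, SK_NOMEM };

	static enum charge_sketch charge_failure_sketch(int is_batch, int is_huge,
							int can_wait,
							int margin_fits,
							int reclaim_made_progress)
	{
		if (is_batch)			/* never reclaim for optional batching */
			return SK_RETRY;	/* retry with a single page */
		if (!can_wait)			/* atomic allocation, no reclaim */
			return SK_WOULDBLOCK;
		/* reclaim has run at this point */
		if (margin_fits)		/* mem_cgroup_check_margin() succeeded */
			return SK_RETRY;
		if (!is_huge && reclaim_made_progress)
			return SK_RETRY;	/* regular page: give it another try */
		return SK_NOMEM;		/* huge pages fall back; else head for OOM */
	}
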
@@ -2144,6 +2175,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
 	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
 	unsigned long flags;
 
+	if (mem_cgroup_disabled())
+		return;
 	/*
 	 * We have no races with charge/uncharge but will have races with
 	 * page state accounting.
@@ -2233,7 +2266,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 {
 	int ret = -EINVAL;
 	unsigned long flags;
-
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
 	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
 		return -EBUSY;
 
@@ -2265,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
-	int charge = PAGE_SIZE;
+	int page_size = PAGE_SIZE;
 	unsigned long flags;
 	int ret;
 
@@ -2278,23 +2316,26 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 		goto out;
 	if (isolate_lru_page(page))
 		goto put;
-	/* The page is isolated from LRU and we have no race with splitting */
-	charge = PAGE_SIZE << compound_order(page);
+
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
 
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				&parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	if (charge > PAGE_SIZE)
+	if (page_size > PAGE_SIZE)
 		flags = compound_lock_irqsave(page);
 
-	ret = mem_cgroup_move_account(pc, child, parent, true, charge);
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent, charge);
-put_back:
-	if (charge > PAGE_SIZE)
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
 		compound_unlock_irqrestore(page, flags);
+put_back:
 	putback_lru_page(page);
 put:
 	put_page(page);
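Besides the charge/page_size rename, note that the put_back: label moves below the conditional unlock: a failed __mem_cgroup_try_charge() jumps straight to putting the page back and must not release a compound lock it never took. A minimal standalone sketch of that corrected error-path layout (illustration only, not kernel code):

	#include <stdbool.h>

	static bool locked;	/* stand-in for the compound page lock */

	static int move_parent_sketch(bool is_huge, bool charge_fails)
	{
		int ret = 0;

		if (charge_fails) {
			ret = -1;
			goto put_back;		/* never touched the lock */
		}
		if (is_huge)
			locked = true;		/* compound_lock_irqsave() */
		/* ... move the accounting to the parent here ... */
		if (is_huge)
			locked = false;		/* compound_unlock_irqrestore() */
	put_back:
		return ret;			/* putback_lru_page()/put_page() follow */
	}
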
@@ -2312,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
-	int page_size = PAGE_SIZE;
 
 	if (PageTransHuge(page)) {
 		page_size <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page. The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
 	}
 
 	pc = lookup_page_cgroup(page);
@@ -2327,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
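With oom now false for transparent huge pages, a failed huge-page charge simply returns an error to the fault path, which is expected to retry with regular pages where OOM handling still applies. A rough sketch of that fallback shape (hypothetical helper names and sizes, not the kernel's THP fault path):

	static unsigned long margin_sketch = 64 * 4096UL;	/* headroom left under the limit */

	/* Hypothetical charge helper: 0 on success, -1 when nothing fits. */
	static int try_charge_sketch(unsigned long bytes, int allow_oom_kill)
	{
		if (bytes <= margin_sketch) {
			margin_sketch -= bytes;
			return 0;
		}
		(void)allow_oom_kill;	/* only the small-page retry may OOM-kill */
		return -1;
	}

	static int fault_sketch(void)
	{
		if (try_charge_sketch(512 * 4096UL, 0) == 0)	/* huge page, no OOM kill */
			return 0;
		return try_charge_sketch(4096UL, 1);		/* fall back to one page */
	}
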
@@ -5013,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
-	if (!s || !strcmp(s, "1"))
+	if (!(*s) || !strcmp(s, "=1"))
 		really_do_swap_account = 1;
-	else if (!strcmp(s, "0"))
+	else if (!strcmp(s, "=0"))
 		really_do_swap_account = 0;
 	return 1;
 }
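The string checks change because, with __setup("swapaccount", ...) and no trailing '=', the handler receives everything that follows the parameter name: a bare "swapaccount" arrives as an empty string and "swapaccount=0" arrives as "=0", never as NULL. A small userspace harness (illustrative test strings only) exercising the three accepted forms:

	#include <stdio.h>
	#include <string.h>

	static int really_do_swap_account = -1;

	static int enable_swap_account(char *s)
	{
		/* consider enabled if no parameter or 1 is given */
		if (!(*s) || !strcmp(s, "=1"))
			really_do_swap_account = 1;
		else if (!strcmp(s, "=0"))
			really_do_swap_account = 0;
		return 1;
	}

	int main(void)
	{
		char *suffixes[] = { "", "=1", "=0" };	/* swapaccount, swapaccount=1, swapaccount=0 */

		for (int i = 0; i < 3; i++) {
			enable_swap_account(suffixes[i]);
			printf("\"swapaccount%s\" -> %d\n", suffixes[i], really_do_swap_account);
		}
		return 0;
	}
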
@@ -5023,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	enable_swap_account("0");
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);