Memory controller: make charging gfp mask aware

Nick Piggin pointed out that swap cache and page cache addition routines could be called from non-GFP_KERNEL contexts. This patch makes the charging routine aware of the gfp context. Charging might fail if the cgroup is over its limit, in which case a suitable error is returned. This patch was tested on a PowerPC box. I am still looking at being able to test the path through which allocations happen in non-GFP_KERNEL contexts. [kamezawa.hiroyu@jp.fujitsu.com: problem with ZONE_MOVABLE] Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Pavel Emelianov <xemul@openvz.org> Cc: Paul Menage <menage@google.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Kirill Korotaev <dev@sw.ru> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: David Rientjes <rientjes@google.com> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Balbir Singh <balbir@linux.vnet.ibm.com> 2008-02-07 03:14:02 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-07 11:42:19 -0500
commit: e1a1cd590e3fcb0d2e230128daf2337ea55387dc (patch)
tree: eb660ab340c657a1eb595b2d4d8e8b62783bf6fb /mm
parent: bed7161a519a2faef53e1bce1b47595e297c1d14 (diff)
7 files changed, 31 insertions, 25 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 8ae171cc2811..63040d5e0ae2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -464,7 +464,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
        if (error == 0) {
-                error = mem_cgroup_cache_charge(page, current->mm);
+                error = mem_cgroup_cache_charge(page, current->mm, gfp_mask);
                if (error)
                        goto out;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ff7cac602984..ac8774426fec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -261,7 +261,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+                                gfp_t gfp_mask)
 {
        struct mem_cgroup *mem;
        struct page_cgroup *pc, *race_pc;
@@ -293,7 +294,7 @@ retry:
        unlock_page_cgroup(page);
-        pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+        pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
        if (pc == NULL)
                goto err;
@@ -320,7 +321,14 @@ retry:
         * the cgroup limit.
         */
        while (res_counter_charge(&mem->res, PAGE_SIZE)) {
-                if (try_to_free_mem_cgroup_pages(mem))
+                bool is_atomic = gfp_mask & GFP_ATOMIC;
+                /*
+                 * We cannot reclaim under GFP_ATOMIC, fail the charge
+                 */
+                if (is_atomic)
+                        goto noreclaim;
+                if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
                        continue;
                /*
@@ -344,9 +352,10 @@ retry:
                        congestion_wait(WRITE, HZ/10);
                        continue;
                }
+noreclaim:
                css_put(&mem->css);
-                mem_cgroup_out_of_memory(mem, GFP_KERNEL);
+                if (!is_atomic)
+                        mem_cgroup_out_of_memory(mem, GFP_KERNEL);
                goto free_pc;
        }
@@ -385,7 +394,8 @@ err:
 /*
 * See if the cached pages should be charged at all?
 */
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm)
+int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+                                gfp_t gfp_mask)
 {
        struct mem_cgroup *mem;
        if (!mm)
@@ -393,7 +403,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm)
        mem = rcu_dereference(mm->mem_cgroup);
        if (mem->control_type == MEM_CGROUP_TYPE_ALL)
-                return mem_cgroup_charge(page, mm);
+                return mem_cgroup_charge(page, mm, gfp_mask);
        else
                return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 0ba224ea6ba4..153a54b2013c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1147,7 +1147,7 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
        pte_t *pte;
        spinlock_t *ptl;
-        retval = mem_cgroup_charge(page, mm);
+        retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
        if (retval)
                goto out;
@@ -1650,7 +1650,7 @@ gotten:
        cow_user_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
-        if (mem_cgroup_charge(new_page, mm))
+        if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
        /*
@@ -2052,7 +2052,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(PGMAJFAULT);
        }
-        if (mem_cgroup_charge(page, mm)) {
+        if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                ret = VM_FAULT_OOM;
                goto out;
@@ -2139,7 +2139,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto oom;
        __SetPageUptodate(page);
-        if (mem_cgroup_charge(page, mm))
+        if (mem_cgroup_charge(page, mm, GFP_KERNEL))
                goto oom_free_page;
        entry = mk_pte(page, vma->vm_page_prot);
@@ -2277,7 +2277,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
-        if (mem_cgroup_charge(page, mm)) {
+        if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
                ret = VM_FAULT_OOM;
                goto out;
        }
diff --git a/mm/migrate.c b/mm/migrate.c
index 417bbda14e5b..763794144697 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -153,7 +153,7 @@ static void remove_migration_pte(struct vm_area_struct *vma,
                return;
        }
-        if (mem_cgroup_charge(new, mm)) {
+        if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
                pte_unmap(ptep);
                return;
        }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 88258869c8e7..581b609e748d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -78,7 +78,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
        error = radix_tree_preload(gfp_mask);
        if (!error) {
-                error = mem_cgroup_cache_charge(page, current->mm);
+                error = mem_cgroup_cache_charge(page, current->mm, gfp_mask);
                if (error)
                        goto out;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fddc4cc4149b..35e00c3d0286 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -510,7 +510,7 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
-        if (mem_cgroup_charge(page, vma->vm_mm))
+        if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
                return -ENOMEM;
        inc_mm_counter(vma->vm_mm, anon_rss);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 215f6a726b2f..b7d868cbca09 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1337,16 +1337,11 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 #ifdef CONFIG_CGROUP_MEM_CONT
-#ifdef CONFIG_HIGHMEM
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
-#define ZONE_USERPAGES ZONE_HIGHMEM
+                                                gfp_t gfp_mask)
-#else
-#define ZONE_USERPAGES ZONE_NORMAL
-#endif
-unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
 {
        struct scan_control sc = {
-                .gfp_mask = GFP_KERNEL,
+                .gfp_mask = gfp_mask,
                .may_writepage = !laptop_mode,
                .may_swap = 1,
                .swap_cluster_max = SWAP_CLUSTER_MAX,
@@ -1357,9 +1352,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
        };
        int node;
        struct zone **zones;
+        int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
        for_each_online_node(node) {
-                zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;
+                zones = NODE_DATA(node)->node_zonelists[target_zone].zones;
                if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
                        return 1;
        }
author	Balbir Singh <balbir@linux.vnet.ibm.com>	2008-02-07 03:14:02 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:19 -0500
commit	e1a1cd590e3fcb0d2e230128daf2337ea55387dc (patch)
tree	eb660ab340c657a1eb595b2d4d8e8b62783bf6fb /mm
parent	bed7161a519a2faef53e1bce1b47595e297c1d14 (diff)

diff --git a/mm/filemap.c b/mm/filemap.c index 8ae171cc2811..63040d5e0ae2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c
@@ -464,7 +464,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
464		464
465	if (error == 0) {	465	if (error == 0) {
466		466
467	error = mem_cgroup_cache_charge(page, current->mm);	467	error = mem_cgroup_cache_charge(page, current->mm, gfp_mask);
468	if (error)	468	if (error)
469	goto out;	469	goto out;
470		470


diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff7cac602984..ac8774426fec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c
@@ -261,7 +261,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
261	* 0 if the charge was successful	261	* 0 if the charge was successful
262	* < 0 if the cgroup is over its limit	262	* < 0 if the cgroup is over its limit
263	*/	263	*/
264	int mem_cgroup_charge(struct page *page, struct mm_struct *mm)	264	int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
		265	gfp_t gfp_mask)
265	{	266	{
266	struct mem_cgroup *mem;	267	struct mem_cgroup *mem;
267	struct page_cgroup *pc, *race_pc;	268	struct page_cgroup *pc, *race_pc;
@@ -293,7 +294,7 @@ retry:
293		294
294	unlock_page_cgroup(page);	295	unlock_page_cgroup(page);
295		296
296	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);	297	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
297	if (pc == NULL)	298	if (pc == NULL)
298	goto err;	299	goto err;
299		300
@@ -320,7 +321,14 @@ retry:
320	* the cgroup limit.	321	* the cgroup limit.
321	*/	322	*/
322	while (res_counter_charge(&mem->res, PAGE_SIZE)) {	323	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
323	if (try_to_free_mem_cgroup_pages(mem))	324	bool is_atomic = gfp_mask & GFP_ATOMIC;
		325	/*
		326	* We cannot reclaim under GFP_ATOMIC, fail the charge
		327	*/
		328	if (is_atomic)
		329	goto noreclaim;
		330
		331	if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
324	continue;	332	continue;
325		333
326	/*	334	/*
@@ -344,9 +352,10 @@ retry:
344	congestion_wait(WRITE, HZ/10);	352	congestion_wait(WRITE, HZ/10);
345	continue;	353	continue;
346	}	354	}
347		355	noreclaim:
348	css_put(&mem->css);	356	css_put(&mem->css);
349	mem_cgroup_out_of_memory(mem, GFP_KERNEL);	357	if (!is_atomic)
		358	mem_cgroup_out_of_memory(mem, GFP_KERNEL);
350	goto free_pc;	359	goto free_pc;
351	}	360	}
352		361
@@ -385,7 +394,8 @@ err:
385	/*	394	/*
386	* See if the cached pages should be charged at all?	395	* See if the cached pages should be charged at all?
387	*/	396	*/
388	int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm)	397	int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
		398	gfp_t gfp_mask)
389	{	399	{
390	struct mem_cgroup *mem;	400	struct mem_cgroup *mem;
391	if (!mm)	401	if (!mm)
@@ -393,7 +403,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm)
393		403
394	mem = rcu_dereference(mm->mem_cgroup);	404	mem = rcu_dereference(mm->mem_cgroup);
395	if (mem->control_type == MEM_CGROUP_TYPE_ALL)	405	if (mem->control_type == MEM_CGROUP_TYPE_ALL)
396	return mem_cgroup_charge(page, mm);	406	return mem_cgroup_charge(page, mm, gfp_mask);
397	else	407	else
398	return 0;	408	return 0;
399	}	409	}


diff --git a/mm/memory.c b/mm/memory.c index 0ba224ea6ba4..153a54b2013c 100644 --- a/mm/memory.c +++ b/mm/memory.c
@@ -1147,7 +1147,7 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
1147	pte_t *pte;	1147	pte_t *pte;
1148	spinlock_t *ptl;	1148	spinlock_t *ptl;
1149		1149
1150	retval = mem_cgroup_charge(page, mm);	1150	retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1151	if (retval)	1151	if (retval)
1152	goto out;	1152	goto out;
1153		1153
@@ -1650,7 +1650,7 @@ gotten:
1650	cow_user_page(new_page, old_page, address, vma);	1650	cow_user_page(new_page, old_page, address, vma);
1651	__SetPageUptodate(new_page);	1651	__SetPageUptodate(new_page);
1652		1652
1653	if (mem_cgroup_charge(new_page, mm))	1653	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
1654	goto oom_free_new;	1654	goto oom_free_new;
1655		1655
1656	/*	1656	/*
@@ -2052,7 +2052,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2052	count_vm_event(PGMAJFAULT);	2052	count_vm_event(PGMAJFAULT);
2053	}	2053	}
2054		2054
2055	if (mem_cgroup_charge(page, mm)) {	2055	if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2056	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);	2056	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2057	ret = VM_FAULT_OOM;	2057	ret = VM_FAULT_OOM;
2058	goto out;	2058	goto out;
@@ -2139,7 +2139,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2139	goto oom;	2139	goto oom;
2140	__SetPageUptodate(page);	2140	__SetPageUptodate(page);
2141		2141
2142	if (mem_cgroup_charge(page, mm))	2142	if (mem_cgroup_charge(page, mm, GFP_KERNEL))
2143	goto oom_free_page;	2143	goto oom_free_page;
2144		2144
2145	entry = mk_pte(page, vma->vm_page_prot);	2145	entry = mk_pte(page, vma->vm_page_prot);
@@ -2277,7 +2277,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2277		2277
2278	}	2278	}
2279		2279
2280	if (mem_cgroup_charge(page, mm)) {	2280	if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2281	ret = VM_FAULT_OOM;	2281	ret = VM_FAULT_OOM;
2282	goto out;	2282	goto out;
2283	}	2283	}


diff --git a/mm/migrate.c b/mm/migrate.c index 417bbda14e5b..763794144697 100644 --- a/mm/migrate.c +++ b/mm/migrate.c
@@ -153,7 +153,7 @@ static void remove_migration_pte(struct vm_area_struct *vma,
153	return;	153	return;
154	}	154	}
155		155
156	if (mem_cgroup_charge(new, mm)) {	156	if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
157	pte_unmap(ptep);	157	pte_unmap(ptep);
158	return;	158	return;
159	}	159	}


diff --git a/mm/swap_state.c b/mm/swap_state.c index 88258869c8e7..581b609e748d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c
@@ -78,7 +78,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
78	error = radix_tree_preload(gfp_mask);	78	error = radix_tree_preload(gfp_mask);
79	if (!error) {	79	if (!error) {
80		80
81	error = mem_cgroup_cache_charge(page, current->mm);	81	error = mem_cgroup_cache_charge(page, current->mm, gfp_mask);
82	if (error)	82	if (error)
83	goto out;	83	goto out;
84		84


diff --git a/mm/swapfile.c b/mm/swapfile.c index fddc4cc4149b..35e00c3d0286 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c
@@ -510,7 +510,7 @@ unsigned int count_swap_pages(int type, int free)
510	static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,	510	static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
511	unsigned long addr, swp_entry_t entry, struct page *page)	511	unsigned long addr, swp_entry_t entry, struct page *page)
512	{	512	{
513	if (mem_cgroup_charge(page, vma->vm_mm))	513	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
514	return -ENOMEM;	514	return -ENOMEM;
515		515
516	inc_mm_counter(vma->vm_mm, anon_rss);	516	inc_mm_counter(vma->vm_mm, anon_rss);


diff --git a/mm/vmscan.c b/mm/vmscan.c index 215f6a726b2f..b7d868cbca09 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c
@@ -1337,16 +1337,11 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1337		1337
1338	#ifdef CONFIG_CGROUP_MEM_CONT	1338	#ifdef CONFIG_CGROUP_MEM_CONT
1339		1339
1340	#ifdef CONFIG_HIGHMEM	1340	unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1341	#define ZONE_USERPAGES ZONE_HIGHMEM	1341	gfp_t gfp_mask)
1342	#else
1343	#define ZONE_USERPAGES ZONE_NORMAL
1344	#endif
1345
1346	unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
1347	{	1342	{
1348	struct scan_control sc = {	1343	struct scan_control sc = {
1349	.gfp_mask = GFP_KERNEL,	1344	.gfp_mask = gfp_mask,
1350	.may_writepage = !laptop_mode,	1345	.may_writepage = !laptop_mode,
1351	.may_swap = 1,	1346	.may_swap = 1,
1352	.swap_cluster_max = SWAP_CLUSTER_MAX,	1347	.swap_cluster_max = SWAP_CLUSTER_MAX,
@@ -1357,9 +1352,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
1357	};	1352	};
1358	int node;	1353	int node;
1359	struct zone **zones;	1354	struct zone **zones;
		1355	int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
1360		1356
1361	for_each_online_node(node) {	1357	for_each_online_node(node) {
1362	zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;	1358	zones = NODE_DATA(node)->node_zonelists[target_zone].zones;
1363	if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))	1359	if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
1364	return 1;	1360	return 1;
1365	}	1361	}