author      KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>    2009-01-07 21:08:00 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org>        2009-01-08 11:31:05 -0500
commit      8c7c6e34a1256a5082d38c8e9bd1474476912715 (patch)
tree        09f53c7c4bac5532a9ecbdadb4450702c744ea6f /mm/memory.c
parent      27a7faa0779dd13729196c1a818c294f44bbd1ee (diff)
memcg: mem+swap controller core
This patch implements a per-cgroup limit on the usage of memory+swap. Where SwapCache is involved, double counting of swap-cache and swap-entry is avoided.

The mem+swap controller works as follows:

- memory usage is limited by memory.limit_in_bytes.
- memory + swap usage is limited by memory.memsw_limit_in_bytes.

This has the following benefits:

- A user can limit the total resource usage of mem+swap. Without this, because the memory resource controller does not account for swap usage, a process can exhaust all of the swap (e.g. via a memory leak). This case can now be avoided.

  Also, swap is a shared resource, but it cannot be reclaimed (moved back to memory) until it is used. This characteristic can be a problem when memory is divided into parts by cpuset or memcg. Assume group A and group B. After some applications have run, the system can end up as:

    Group A -- very large free memory space, but occupying 99% of swap.
    Group B -- under memory shortage, but unable to use swap... it is nearly full.

  The ability to set an appropriate swap limit for each group is therefore required.

Someone may wonder: "why not limit swap alone, rather than mem+swap?"

- The global LRU (kswapd) can swap out arbitrary pages. Swap-out moves an account from memory to swap, so there is no change in the usage of mem+swap. In other words, when we want to limit the usage of swap without affecting the global LRU, a mem+swap limit is better than just limiting swap.

Accounting target information is stored in swap_cgroup, which is a per-swap-entry record.

Charging is done as follows:

  map
    - charge page and memsw.

  unmap
    - uncharge page/memsw if not SwapCache.

  swap-out (__delete_from_swap_cache)
    - uncharge page.
    - record mem_cgroup information to swap_cgroup.

  swap-in (do_swap_page)
    - charged as page and memsw.
    - the record in swap_cgroup is cleared and its memsw accounting is decremented.

  swap-free (swap_free())
    - if the swap entry is freed, memsw is uncharged by PAGE_SIZE.

Some people work in never-swap environments and consider swap to be something bad. For them, this mem+swap controller extension is just overhead; that overhead can be avoided by a config or boot option. (See Kconfig; the details are not in this patch.)

TODO:
- Maybe more optimization can be done in the swap-in path (but it is not very safe); we just do simple accounting at this stage.

[nishimura@mxp.nes.nec.co.jp: make resize limit hold mutex]
[hugh@veritas.com: memswap controller core swapcache fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
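Editor's note: to make the charge/uncharge rules above concrete, here is a minimal userspace sketch, not kernel code. The struct and function names are invented for illustration; "mem" stands in for usage accounted against memory.limit_in_bytes, "memsw" for usage accounted against the mem+swap limit, and "swap_record" for a swap_cgroup record owning a swapped-out copy.

/*
 * Toy model of the mem+swap accounting events listed above (one page,
 * one group).  All names here are illustrative, not kernel identifiers.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

struct group {
        unsigned long mem;    /* bytes charged as memory        */
        unsigned long memsw;  /* bytes charged as memory + swap */
        int swap_record;      /* swap_cgroup record held?       */
};

/* map: charge page and memsw */
static void map_page(struct group *g)
{
        g->mem += PAGE_SIZE;
        g->memsw += PAGE_SIZE;
}

/* unmap (page not in swap cache): uncharge page and memsw */
static void unmap_page(struct group *g)
{
        g->mem -= PAGE_SIZE;
        g->memsw -= PAGE_SIZE;
}

/* swap-out: uncharge page; the memsw charge now belongs to the swap entry */
static void swap_out_page(struct group *g)
{
        g->mem -= PAGE_SIZE;
        g->swap_record = 1;
}

/* swap-in: charge page and memsw, then clear the swap_cgroup record and
 * decrement the memsw charge it was holding -- net memsw is unchanged */
static void swap_in_page(struct group *g)
{
        g->mem += PAGE_SIZE;
        g->memsw += PAGE_SIZE;
        if (g->swap_record) {
                g->swap_record = 0;
                g->memsw -= PAGE_SIZE;
        }
}

/* swap-free: entry freed while still recorded, so memsw is finally uncharged */
static void swap_free_entry(struct group *g)
{
        if (g->swap_record) {
                g->swap_record = 0;
                g->memsw -= PAGE_SIZE;
        }
}

int main(void)
{
        struct group g = { 0, 0, 0 };

        map_page(&g);       /* mem=4096  memsw=4096                          */
        swap_out_page(&g);  /* mem=0     memsw=4096 : swap space still counted */
        swap_in_page(&g);   /* mem=4096  memsw=4096 : record cleared          */
        unmap_page(&g);     /* mem=0     memsw=0                              */
        printf("mem=%lu memsw=%lu\n", g.mem, g.memsw);
        (void)swap_free_entry;  /* would uncharge memsw if the entry outlived the mapping */
        return 0;
}

Note how swap-out leaves memsw charged: the group keeps paying for the swap slot until the entry is swapped back in or freed, which is exactly the behavior that prevents one group from silently eating all of swap.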
Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   18
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index ba5189e322e6..1358012ffa73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2431,7 +2431,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-	if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) {
+	if (mem_cgroup_try_charge_swapin(mm, page,
+				GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) {
 		ret = VM_FAULT_OOM;
 		unlock_page(page);
 		goto out;
@@ -2449,8 +2450,20 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_nomap;
 	}
 
-	/* The page isn't present yet, go ahead with the fault. */
+	/*
+	 * The page isn't present yet, go ahead with the fault.
+	 *
+	 * Be careful about the sequence of operations here.
+	 * To get its accounting right, reuse_swap_page() must be called
+	 * while the page is counted on swap but not yet in mapcount i.e.
+	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+	 * must be called after the swap_free(), or it will never succeed.
+	 * And mem_cgroup_commit_charge_swapin(), which uses the swp_entry
+	 * in page->private, must be called before reuse_swap_page(),
+	 * which may delete_from_swap_cache().
+	 */
 
+	mem_cgroup_commit_charge_swapin(page, ptr);
 	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && reuse_swap_page(page)) {
@@ -2461,7 +2474,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	page_add_anon_rmap(page, vma, address);
-	mem_cgroup_commit_charge_swapin(page, ptr);
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
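Editor's note: the comment added by the second hunk packs several ordering constraints into prose. As a purely illustrative aid (not kernel code; every model_* name and flag below is invented), this userspace sketch encodes those constraints as assertions and then runs the calls in the post-patch order. Running them out of order, e.g. reuse_swap_page() after page_add_anon_rmap(), mem_cgroup_commit_charge_swapin() after reuse_swap_page(), or try_to_free_swap() before swap_free(), trips an assert.

/* Toy model of the ordering rules stated in the new do_swap_page() comment. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool counted_on_swap  = true;   /* swap entry still holds a reference   */
static bool in_mapcount      = false;  /* page_add_anon_rmap() already called? */
static bool charge_committed = false;  /* commit_charge_swapin() already done? */
static bool swap_cache_entry = true;   /* swp_entry still in page->private     */

/* Uses the swp_entry in page->private, so the swap-cache entry must exist. */
static void model_commit_charge_swapin(void)
{
        assert(swap_cache_entry);
        charge_committed = true;
}

/* Must run while the page is counted on swap but not yet in mapcount, and
 * only after the charge was committed; it may delete_from_swap_cache(). */
static void model_reuse_swap_page(void)
{
        assert(counted_on_swap && !in_mapcount);
        assert(charge_committed);
        swap_cache_entry = false;
}

static void model_page_add_anon_rmap(void)
{
        in_mapcount = true;
}

static void model_swap_free(void)
{
        counted_on_swap = false;
}

/* Can only succeed once the swap reference has been dropped. */
static void model_try_to_free_swap(void)
{
        assert(!counted_on_swap);
}

int main(void)
{
        /* Post-patch order from the hunks above. */
        model_commit_charge_swapin();
        model_reuse_swap_page();
        model_page_add_anon_rmap();
        model_swap_free();
        model_try_to_free_swap();
        puts("ordering constraints satisfied");
        return 0;
}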