path: root/mm/memcontrol.c
author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2009-01-07 21:08:00 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-01-08 11:31:05 -0500
commit	8c7c6e34a1256a5082d38c8e9bd1474476912715 (patch)
tree	09f53c7c4bac5532a9ecbdadb4450702c744ea6f /mm/memcontrol.c
parent	27a7faa0779dd13729196c1a818c294f44bbd1ee (diff)
memcg: mem+swap controller core
This patch implements a per-cgroup limit for usage of memory+swap. However, there is SwapCache; double counting of swap-cache and swap-entry is avoided.

The mem+swap controller works as follows.
- memory usage is limited by memory.limit_in_bytes.
- memory + swap usage is limited by memory.memsw.limit_in_bytes.

This has the following benefits.
- A user can limit total resource usage of mem+swap.
  Without this, because the memory resource controller doesn't take care of
  swap usage, a process can exhaust all of the swap (e.g. by a memory leak).
  We can avoid this case.
  Also, swap is a shared resource, but it cannot be reclaimed (returned to
  memory) until it is used. This characteristic can be trouble when memory
  is divided into parts by cpuset or memcg. Assume group A and group B;
  after some applications run, the system can end up as:
  Group A -- very large free memory space but occupies 99% of swap.
  Group B -- under memory shortage but cannot use swap... it's nearly full.
  The ability to set an appropriate swap limit for each group is required.

Maybe someone wonders "why not limit swap rather than mem+swap?"
- The global LRU (kswapd) can swap out arbitrary pages. Swap-out means
  moving an account from memory to swap... there is no change in the usage of
  mem+swap. In other words, when we want to limit swap usage without
  affecting the global LRU, a mem+swap limit is better than just limiting swap.

Accounting target information is stored in swap_cgroup, which is a
per-swap-entry record.

Charging is done as follows (a toy model of these events is sketched after
this message).
map
  - charge page and memsw.
unmap
  - uncharge page/memsw if not SwapCache.
swap-out (__delete_from_swap_cache)
  - uncharge page.
  - record mem_cgroup information to swap_cgroup.
swap-in (do_swap_page)
  - charged as page and memsw.
    The record in swap_cgroup is cleared and the memsw accounting for it
    is decremented.
swap-free (swap_free())
  - if the swap entry is freed, memsw is uncharged by PAGE_SIZE.

Some people work in never-swap environments and consider swap to be
something bad. For such people, this mem+swap controller extension is just
overhead; that overhead can be avoided by a config or boot option.
(See Kconfig; details are not in this patch.)

TODO:
- maybe more optimization can be done in the swap-in path (but it is not
  very safe); we just do simple accounting at this stage.

[nishimura@mxp.nes.nec.co.jp: make resize limit hold mutex]
[hugh@veritas.com: memswap controller core swapcache fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
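To make the charge/uncharge rules above easier to follow, here is a minimal userspace sketch (illustration only, not kernel code; all names are invented) of the net effect each event has on the two counters.

/* toy_memsw.c -- illustrative model of the mem / mem+swap accounting events */
#include <assert.h>
#include <stdio.h>

struct toy_memcg {
	long mem;	/* pages counted against memory.limit_in_bytes */
	long memsw;	/* pages counted against memory.memsw.limit_in_bytes */
};

static void ev_map(struct toy_memcg *cg)       { cg->mem++; cg->memsw++; }
static void ev_unmap(struct toy_memcg *cg)     { cg->mem--; cg->memsw--; } /* page not in SwapCache */
static void ev_swap_out(struct toy_memcg *cg)  { cg->mem--; }              /* memsw unchanged: the swap entry now carries it */
static void ev_swap_in(struct toy_memcg *cg)   { cg->mem++; }              /* net effect: page charged, swap record uncharged */
static void ev_swap_free(struct toy_memcg *cg) { cg->memsw--; }            /* entry freed without being swapped back in */

int main(void)
{
	struct toy_memcg cg = {0, 0};

	ev_map(&cg);		/* mem=1 memsw=1 */
	ev_swap_out(&cg);	/* mem=0 memsw=1: usage moved from memory to swap */
	ev_swap_in(&cg);	/* mem=1 memsw=1 */
	ev_unmap(&cg);		/* mem=0 memsw=0 once the page is finally unmapped */
	printf("mem=%ld memsw=%ld\n", cg.mem, cg.memsw);
	assert(cg.mem == 0 && cg.memsw == 0);
	(void)ev_swap_free;	/* only relevant for entries never swapped back in */
	return 0;
}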
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 400
1 file changed, 362 insertions(+), 38 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59dd8c116372..2efcf38f3b73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -132,12 +133,18 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
+	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
 	struct mem_cgroup_lru_info info;
 
 	int prev_priority;	/* for recording reclaim priority */
+	int obsolete;
+	atomic_t refcnt;
 	/*
 	 * statistics. This must be placed at the end of memcg.
 	 */
@@ -167,6 +174,17 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 	0, /* FORCE */
 };
 
+
+/* for encoding cft->private value on file */
+#define _MEM			(0)
+#define _MEMSWAP		(1)
+#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)	((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
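As a quick illustration of the MEMFILE_* encoding introduced above, this standalone snippet (userspace, illustration only; the RES_* enum is just a stand-in for the real res_counter attributes) shows how a single cft->private value carries both the counter type and the attribute.

#include <assert.h>

#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

enum { RES_USAGE, RES_MAX_USAGE, RES_LIMIT, RES_FAILCNT };	/* stand-in values */

int main(void)
{
	int priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);	/* as used for memsw.limit_in_bytes */

	assert(MEMFILE_TYPE(priv) == _MEMSWAP);
	assert(MEMFILE_ATTR(priv) == RES_LIMIT);
	return 0;
}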
@@ -485,7 +503,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+			gfp_t gfp_mask, struct mem_cgroup **memcg,
+			bool oom)
 {
 	struct mem_cgroup *mem;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -513,12 +532,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		css_get(&mem->css);
 	}
 
+	while (1) {
+		int ret;
+		bool noswap = false;
 
-	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
+		ret = res_counter_charge(&mem->res, PAGE_SIZE);
+		if (likely(!ret)) {
+			if (!do_swap_account)
+				break;
+			ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
+			if (likely(!ret))
+				break;
+			/* mem+swap counter fails */
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			noswap = true;
+		}
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+		if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
 			continue;
 
 		/*
@@ -527,8 +559,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		 * moved to swap cache or just unmapped from the cgroup.
 		 * Check the limit again to see if the reclaim reduced the
 		 * current usage of the cgroup before giving up
+		 *
 		 */
-		if (res_counter_check_under_limit(&mem->res))
+		if (!do_swap_account &&
+			res_counter_check_under_limit(&mem->res))
+			continue;
+		if (do_swap_account &&
+			res_counter_check_under_limit(&mem->memsw))
 			continue;
 
 		if (!nr_retries--) {
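The charge loop above orders the two counters deliberately. The following is a simplified standalone sketch of that ordering (plain integers instead of res_counter, invented helper names): charge mem first, then memsw; if memsw fails, roll back mem and reclaim with swap disabled, because swapping pages out cannot reduce mem+swap usage.

#include <stdbool.h>

struct toy_counter { long usage, limit; };

static bool toy_charge(struct toy_counter *c, long nr)
{
	if (c->usage + nr > c->limit)
		return false;
	c->usage += nr;
	return true;
}

/* returns true on success; on failure *noswap says whether reclaim may swap */
static bool toy_try_charge(struct toy_counter *res, struct toy_counter *memsw,
			   bool do_swap_account, bool *noswap)
{
	*noswap = false;
	if (!toy_charge(res, 1))
		return false;			/* over the memory limit: swap-out can help */
	if (do_swap_account && !toy_charge(memsw, 1)) {
		res->usage -= 1;		/* roll back the memory charge */
		*noswap = true;			/* over mem+swap: swap-out would not help */
		return false;
	}
	return true;
}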
@@ -582,6 +619,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 		css_put(&mem->css);
 		return;
 	}
@@ -646,6 +685,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	__mem_cgroup_remove_list(from_mz, pc);
 	css_put(&from->css);
 	res_counter_uncharge(&from->res, PAGE_SIZE);
+	if (do_swap_account)
+		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	pc->mem_cgroup = to;
 	css_get(&to->css);
 	__mem_cgroup_add_list(to_mz, pc, false);
@@ -692,8 +733,11 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	/* drop extra refcnt */
 	css_put(&parent->css);
 	/* uncharge if move fails */
-	if (ret)
+	if (ret) {
 		res_counter_uncharge(&parent->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+	}
 
 	return ret;
 }
@@ -791,7 +835,42 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+				 struct page *page,
+				 gfp_t mask, struct mem_cgroup **ptr)
+{
+	struct mem_cgroup *mem;
+	swp_entry_t ent;
+
+	if (mem_cgroup_subsys.disabled)
+		return 0;
+
+	if (!do_swap_account)
+		goto charge_cur_mm;
+
+	/*
+	 * A racing thread's fault, or swapoff, may have already updated
+	 * the pte, and even removed page from swap cache: return success
+	 * to go on to do_swap_page()'s pte_same() test, which should fail.
+	 */
+	if (!PageSwapCache(page))
+		return 0;
+
+	ent.val = page_private(page);
+
+	mem = lookup_swap_cgroup(ent);
+	if (!mem || mem->obsolete)
+		goto charge_cur_mm;
+	*ptr = mem;
+	return __mem_cgroup_try_charge(NULL, mask, ptr, true);
+charge_cur_mm:
+	if (unlikely(!mm))
+		mm = &init_mm;
+	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+}
+
 #ifdef CONFIG_SWAP
+
 int mem_cgroup_cache_charge_swapin(struct page *page,
 			struct mm_struct *mm, gfp_t mask, bool locked)
 {
@@ -808,8 +887,28 @@ int mem_cgroup_cache_charge_swapin(struct page *page,
 	 * we reach here.
 	 */
 	if (PageSwapCache(page)) {
+		struct mem_cgroup *mem = NULL;
+		swp_entry_t ent;
+
+		ent.val = page_private(page);
+		if (do_swap_account) {
+			mem = lookup_swap_cgroup(ent);
+			if (mem && mem->obsolete)
+				mem = NULL;
+			if (mem)
+				mm = NULL;
+		}
 		ret = mem_cgroup_charge_common(page, mm, mask,
-				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
+				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+
+		if (!ret && do_swap_account) {
+			/* avoid double counting */
+			mem = swap_cgroup_record(ent, NULL);
+			if (mem) {
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+				mem_cgroup_put(mem);
+			}
+		}
 	}
 	if (!locked)
 		unlock_page(page);
@@ -828,6 +927,23 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
 		return;
 	pc = lookup_page_cgroup(page);
 	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+	/*
+	 * Now swap is on-memory. This means this page may be
+	 * counted both as mem and swap....double count.
+	 * Fix it by uncharging from memsw. This SwapCache is stable
+	 * because we're still under lock_page().
+	 */
+	if (do_swap_account) {
+		swp_entry_t ent = {.val = page_private(page)};
+		struct mem_cgroup *memcg;
+		memcg = swap_cgroup_record(ent, NULL);
+		if (memcg) {
+			/* If memcg is obsolete, memcg can be != ptr */
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+			mem_cgroup_put(memcg);
+		}
+
+	}
 }
 
 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
@@ -837,6 +953,8 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	if (!mem)
 		return;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (do_swap_account)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	css_put(&mem->css);
 }
 
@@ -844,29 +962,31 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 /*
  * uncharge if !page_mapped(page)
  */
-static void
+static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
 
 	if (mem_cgroup_subsys.disabled)
-		return;
+		return NULL;
 
 	if (PageSwapCache(page))
-		return;
+		return NULL;
 
 	/*
 	 * Check if our page_cgroup is valid
 	 */
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc || !PageCgroupUsed(pc)))
-		return;
+		return NULL;
 
 	lock_page_cgroup(pc);
 
+	mem = pc->mem_cgroup;
+
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
 
@@ -886,8 +1006,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+
 	ClearPageCgroupUsed(pc);
-	mem = pc->mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
@@ -895,14 +1018,13 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 	unlock_page_cgroup(pc);
 
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	return;
+	return mem;
 
 unlock_out:
 	unlock_page_cgroup(pc);
-	return;
+	return NULL;
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -922,10 +1044,42 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
-void mem_cgroup_uncharge_swapcache(struct page *page)
+/*
+ * called from __delete_from_swap_cache() and drop "page" account.
+ * memcg information is recorded to swap_cgroup of "ent"
+ */
+void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = __mem_cgroup_uncharge_common(page,
+					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+	/* record memcg information */
+	if (do_swap_account && memcg) {
+		swap_cgroup_record(ent, memcg);
+		mem_cgroup_get(memcg);
+	}
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * called from swap_entry_free(). remove record in swap_cgroup and
+ * uncharge "memsw" account.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t ent)
 {
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+	struct mem_cgroup *memcg;
+
+	if (!do_swap_account)
+		return;
+
+	memcg = swap_cgroup_record(ent, NULL);
+	if (memcg) {
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		mem_cgroup_put(memcg);
+	}
 }
+#endif
 
 /*
  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
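The two functions above hand the mem+swap charge over to the swap entry at swap-out and drop it at swap-free. A toy standalone model of that handoff (invented names, a plain array in place of the real swap_cgroup map) might look like this:

#include <stddef.h>

#define TOY_NR_SWAP_SLOTS 64

struct toy_memcg { long memsw; int refcnt; };

static struct toy_memcg *toy_swap_record[TOY_NR_SWAP_SLOTS];

/* exchange the owner recorded for a swap slot, like swap_cgroup_record() */
static struct toy_memcg *toy_record(unsigned int slot, struct toy_memcg *new)
{
	struct toy_memcg *old = toy_swap_record[slot];

	toy_swap_record[slot] = new;
	return old;
}

static void toy_swap_out(unsigned int slot, struct toy_memcg *cg)
{
	toy_record(slot, cg);
	cg->refcnt++;			/* like mem_cgroup_get(): keep cg while swap references it */
}

static void toy_swap_free(unsigned int slot)
{
	struct toy_memcg *cg = toy_record(slot, NULL);

	if (cg) {
		cg->memsw--;		/* drop the mem+swap charge held by the entry */
		cg->refcnt--;		/* like mem_cgroup_put() */
	}
}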
@@ -1034,7 +1188,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	rcu_read_unlock();
 
 	do {
-		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
 		progress += res_counter_check_under_limit(&mem->res);
 	} while (!progress && --retry);
 
@@ -1044,26 +1198,84 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	return 0;
 }
 
+static DEFINE_MUTEX(set_limit_mutex);
+
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 
 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
 	int progress;
+	u64 memswlimit;
 	int ret = 0;
 
-	while (res_counter_set_limit(&memcg->res, val)) {
+	while (retry_count) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
-		if (!retry_count) {
-			ret = -EBUSY;
+		/*
+		 * Rather than hide all in some function, I do this in
+		 * open coded manner. You see what this really does.
+		 * We have to guarantee mem->res.limit < mem->memsw.limit.
+		 */
+		mutex_lock(&set_limit_mutex);
+		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+		if (memswlimit < val) {
+			ret = -EINVAL;
+			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+		ret = res_counter_set_limit(&memcg->res, val);
+		mutex_unlock(&set_limit_mutex);
+
+		if (!ret)
+			break;
+
 		progress = try_to_free_mem_cgroup_pages(memcg,
-						GFP_HIGHUSER_MOVABLE);
-		if (!progress)
+				GFP_HIGHUSER_MOVABLE, false);
+		if (!progress)			retry_count--;
+	}
+	return ret;
+}
+
+int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
+				unsigned long long val)
+{
+	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
+	u64 memlimit, oldusage, curusage;
+	int ret;
+
+	if (!do_swap_account)
+		return -EINVAL;
+
+	while (retry_count) {
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+		/*
+		 * Rather than hide all in some function, I do this in
+		 * open coded manner. You see what this really does.
+		 * We have to guarantee mem->res.limit < mem->memsw.limit.
+		 */
+		mutex_lock(&set_limit_mutex);
+		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+		if (memlimit > val) {
+			ret = -EINVAL;
+			mutex_unlock(&set_limit_mutex);
+			break;
+		}
+		ret = res_counter_set_limit(&memcg->memsw, val);
+		mutex_unlock(&set_limit_mutex);
+
+		if (!ret)
+			break;
+
+		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
+		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		if (curusage >= oldusage)
 			retry_count--;
 	}
 	return ret;
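Both resize paths above enforce the same ordering invariant under set_limit_mutex: the memory limit must never exceed the mem+swap limit. A stripped-down sketch of just that check (plain integers, illustrative only):

#include <errno.h>

struct toy_limits { unsigned long long mem, memsw; };

static int toy_set_mem_limit(struct toy_limits *l, unsigned long long val)
{
	if (l->memsw < val)
		return -EINVAL;		/* memory limit would exceed the mem+swap limit */
	l->mem = val;
	return 0;
}

static int toy_set_memsw_limit(struct toy_limits *l, unsigned long long val)
{
	if (l->mem > val)
		return -EINVAL;		/* mem+swap limit must stay >= the memory limit */
	l->memsw = val;
	return 0;
}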
@@ -1193,7 +1405,7 @@ try_to_free:
 		goto out;
 	}
 	progress = try_to_free_mem_cgroup_pages(mem,
-						GFP_HIGHUSER_MOVABLE);
+						GFP_HIGHUSER_MOVABLE, false);
 	if (!progress) {
 		nr_retries--;
 		/* maybe some writeback is necessary */
@@ -1216,8 +1428,25 @@ int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
-				    cft->private);
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	u64 val = 0;
+	int type, name;
+
+	type = MEMFILE_TYPE(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+	switch (type) {
+	case _MEM:
+		val = res_counter_read_u64(&mem->res, name);
+		break;
+	case _MEMSWAP:
+		if (do_swap_account)
+			val = res_counter_read_u64(&mem->memsw, name);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return val;
 }
 /*
  * The user of this function is...
@@ -1227,15 +1456,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			    const char *buffer)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	int type, name;
 	unsigned long long val;
 	int ret;
 
-	switch (cft->private) {
+	type = MEMFILE_TYPE(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+	switch (name) {
 	case RES_LIMIT:
 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
-		if (!ret)
+		if (ret)
+			break;
+		if (type == _MEM)
 			ret = mem_cgroup_resize_limit(memcg, val);
+		else
+			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 		break;
 	default:
 		ret = -EINVAL; /* should be BUG() ? */
@@ -1247,14 +1483,23 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
 	struct mem_cgroup *mem;
+	int type, name;
 
 	mem = mem_cgroup_from_cont(cont);
-	switch (event) {
+	type = MEMFILE_TYPE(event);
+	name = MEMFILE_ATTR(event);
+	switch (name) {
 	case RES_MAX_USAGE:
-		res_counter_reset_max(&mem->res);
+		if (type == _MEM)
+			res_counter_reset_max(&mem->res);
+		else
+			res_counter_reset_max(&mem->memsw);
 		break;
 	case RES_FAILCNT:
-		res_counter_reset_failcnt(&mem->res);
+		if (type == _MEM)
+			res_counter_reset_failcnt(&mem->res);
+		else
+			res_counter_reset_failcnt(&mem->memsw);
 		break;
 	}
 	return 0;
@@ -1315,24 +1560,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
-		.private = RES_USAGE,
+		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "max_usage_in_bytes",
-		.private = RES_MAX_USAGE,
+		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
-		.private = RES_LIMIT,
+		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
-		.private = RES_FAILCNT,
+		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
@@ -1346,6 +1591,47 @@ static struct cftype mem_cgroup_files[] = {
 	},
 };
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static struct cftype memsw_cgroup_files[] = {
+	{
+		.name = "memsw.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.max_usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.failcnt",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+};
+
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	if (!do_swap_account)
+		return 0;
+	return cgroup_add_files(cont, ss, memsw_cgroup_files,
+				ARRAY_SIZE(memsw_cgroup_files));
+};
+#else
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	return 0;
+}
+#endif
+
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
@@ -1404,14 +1690,44 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	return mem;
 }
 
+/*
+ * At destroying mem_cgroup, references from swap_cgroup can remain.
+ * (scanning all at force_empty is too costly...)
+ *
+ * Instead of clearing all references at force_empty, we remember
+ * the number of reference from swap_cgroup and free mem_cgroup when
+ * it goes down to 0.
+ *
+ * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
+ * entry which points to this memcg will be ignore at swapin.
+ *
+ * Removal of cgroup itself succeeds regardless of refs from swap.
+ */
+
 static void mem_cgroup_free(struct mem_cgroup *mem)
 {
+	if (atomic_read(&mem->refcnt) > 0)
+		return;
 	if (mem_cgroup_size() < PAGE_SIZE)
 		kfree(mem);
 	else
 		vfree(mem);
 }
 
+static void mem_cgroup_get(struct mem_cgroup *mem)
+{
+	atomic_inc(&mem->refcnt);
+}
+
+static void mem_cgroup_put(struct mem_cgroup *mem)
+{
+	if (atomic_dec_and_test(&mem->refcnt)) {
+		if (!mem->obsolete)
+			return;
+		mem_cgroup_free(mem);
+	}
+}
+
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 static void __init enable_swap_cgroup(void)
@@ -1436,6 +1752,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return ERR_PTR(-ENOMEM);
 
 	res_counter_init(&mem->res);
+	res_counter_init(&mem->memsw);
 
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -1456,6 +1773,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 					struct cgroup *cont)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	mem->obsolete = 1;
 	mem_cgroup_force_empty(mem, false);
 }
 
@@ -1474,8 +1792,14 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	return cgroup_add_files(cont, ss, mem_cgroup_files,
-				ARRAY_SIZE(mem_cgroup_files));
+	int ret;
+
+	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
+			ARRAY_SIZE(mem_cgroup_files));
+
+	if (!ret)
+		ret = register_memsw_files(cont, ss);
+	return ret;
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,