Diffstat (limited to 'mm')
 mm/memcontrol.c | 400
 mm/memory.c     |  18
 mm/swap_state.c |   5
 mm/swapfile.c   |  11
 mm/vmscan.c     |   6
 5 files changed, 393 insertions(+), 47 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59dd8c116372..2efcf38f3b73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -132,12 +133,18 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
+	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
 	struct mem_cgroup_lru_info info;
 
 	int prev_priority;	/* for recording reclaim priority */
+	int obsolete;
+	atomic_t refcnt;
 	/*
 	 * statistics. This must be placed at the end of memcg.
 	 */
@@ -167,6 +174,17 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 	0, /* FORCE */
 };
 
+
+/* for encoding cft->private value on file */
+#define _MEM			(0)
+#define _MEMSWAP		(1)
+#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)	((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
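The MEMFILE_* macros pack two things into the single integer that cftype->private provides: which counter a control file talks to (_MEM or _MEMSWAP, in the high 16 bits) and which res_counter attribute it reads or resets (in the low 16 bits). This lets one set of read/write/reset handlers serve both the plain memory files and the new memsw files. A standalone sketch of the round trip; the assert harness and the attribute value are illustrative, not part of the patch:

/* Sketch: how a cft->private value encodes and decodes. */
#include <assert.h>

#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

int main(void)
{
	int attr = 2;	/* stands in for e.g. RES_LIMIT; value illustrative */
	int priv = MEMFILE_PRIVATE(_MEMSWAP, attr);

	assert(MEMFILE_TYPE(priv) == _MEMSWAP);	/* counter type, high bits */
	assert(MEMFILE_ATTR(priv) == attr);	/* attribute, low bits */
	return 0;
}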
@@ -485,7 +503,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+			gfp_t gfp_mask, struct mem_cgroup **memcg,
+			bool oom)
 {
 	struct mem_cgroup *mem;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -513,12 +532,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		css_get(&mem->css);
 	}
 
+	while (1) {
+		int ret;
+		bool noswap = false;
 
-	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
+		ret = res_counter_charge(&mem->res, PAGE_SIZE);
+		if (likely(!ret)) {
+			if (!do_swap_account)
+				break;
+			ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
+			if (likely(!ret))
+				break;
+			/* mem+swap counter fails */
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			noswap = true;
+		}
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+		if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
 			continue;
 
 		/*
@@ -527,8 +559,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		 * moved to swap cache or just unmapped from the cgroup.
 		 * Check the limit again to see if the reclaim reduced the
 		 * current usage of the cgroup before giving up
+		 *
 		 */
-		if (res_counter_check_under_limit(&mem->res))
+		if (!do_swap_account &&
+			res_counter_check_under_limit(&mem->res))
+			continue;
+		if (do_swap_account &&
+			res_counter_check_under_limit(&mem->memsw))
 			continue;
 
 		if (!nr_retries--) {
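This loop is the core of the mem+swap protocol: charge mem->res first, then mem->memsw. If the memsw charge fails, the res charge is rolled back and reclaim runs with swap disabled, because swapping a page out leaves mem+swap usage unchanged. A compressed userspace sketch of that control flow, with every kernel interface replaced by a stub (all names below are stand-ins, not the kernel API):

/* Two-counter charge with noswap fallback; stubs model the patch's
 * res_counter and reclaim calls. */
#include <stdbool.h>

static bool charge_res(void)   { return true; }		/* mem->res has room */
static bool charge_memsw(void) { return false; }	/* mem->memsw is full */
static void uncharge_res(void) { }
static bool reclaim(bool noswap) { (void)noswap; return false; }

static bool try_charge(bool do_swap_account)
{
	int retries = 5;	/* plays MEM_CGROUP_RECLAIM_RETRIES */

	while (retries--) {
		bool noswap = false;

		if (charge_res()) {
			if (!do_swap_account || charge_memsw())
				return true;	/* both counters charged */
			/* memsw is at its limit: undo the res charge and
			 * reclaim without swapping, since swap-out cannot
			 * lower mem+swap usage. */
			uncharge_res();
			noswap = true;
		}
		if (reclaim(noswap))
			continue;		/* made progress, retry */
	}
	return false;	/* caller may fall back to the OOM killer */
}

int main(void)
{
	return try_charge(true) ? 0 : 1;	/* exits nonzero with these stubs */
}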
@@ -582,6 +619,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 		css_put(&mem->css);
 		return;
 	}
@@ -646,6 +685,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	__mem_cgroup_remove_list(from_mz, pc);
 	css_put(&from->css);
 	res_counter_uncharge(&from->res, PAGE_SIZE);
+	if (do_swap_account)
+		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	pc->mem_cgroup = to;
 	css_get(&to->css);
 	__mem_cgroup_add_list(to_mz, pc, false);
@@ -692,8 +733,11 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	/* drop extra refcnt */
 	css_put(&parent->css);
 	/* uncharge if move fails */
-	if (ret)
+	if (ret) {
 		res_counter_uncharge(&parent->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+	}
 
 	return ret;
 }
@@ -791,7 +835,42 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 					MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+				 struct page *page,
+				 gfp_t mask, struct mem_cgroup **ptr)
+{
+	struct mem_cgroup *mem;
+	swp_entry_t ent;
+
+	if (mem_cgroup_subsys.disabled)
+		return 0;
+
+	if (!do_swap_account)
+		goto charge_cur_mm;
+
+	/*
+	 * A racing thread's fault, or swapoff, may have already updated
+	 * the pte, and even removed page from swap cache: return success
+	 * to go on to do_swap_page()'s pte_same() test, which should fail.
+	 */
+	if (!PageSwapCache(page))
+		return 0;
+
+	ent.val = page_private(page);
+
+	mem = lookup_swap_cgroup(ent);
+	if (!mem || mem->obsolete)
+		goto charge_cur_mm;
+	*ptr = mem;
+	return __mem_cgroup_try_charge(NULL, mask, ptr, true);
+charge_cur_mm:
+	if (unlikely(!mm))
+		mm = &init_mm;
+	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+}
+
 #ifdef CONFIG_SWAP
+
 int mem_cgroup_cache_charge_swapin(struct page *page,
 			struct mm_struct *mm, gfp_t mask, bool locked)
 {
@@ -808,8 +887,28 @@ int mem_cgroup_cache_charge_swapin(struct page *page,
 	 * we reach here.
 	 */
 	if (PageSwapCache(page)) {
+		struct mem_cgroup *mem = NULL;
+		swp_entry_t ent;
+
+		ent.val = page_private(page);
+		if (do_swap_account) {
+			mem = lookup_swap_cgroup(ent);
+			if (mem && mem->obsolete)
+				mem = NULL;
+			if (mem)
+				mm = NULL;
+		}
 		ret = mem_cgroup_charge_common(page, mm, mask,
-				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
+				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+
+		if (!ret && do_swap_account) {
+			/* avoid double counting */
+			mem = swap_cgroup_record(ent, NULL);
+			if (mem) {
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+				mem_cgroup_put(mem);
+			}
+		}
 	}
 	if (!locked)
 		unlock_page(page);
@@ -828,6 +927,23 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
 		return;
 	pc = lookup_page_cgroup(page);
 	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+	/*
+	 * Now the swapped page is back in memory, so it may be counted
+	 * both as mem and as swap: a double count. Fix it by uncharging
+	 * memsw. The SwapCache is stable here because we are still under
+	 * lock_page().
+	 */
+	if (do_swap_account) {
+		swp_entry_t ent = {.val = page_private(page)};
+		struct mem_cgroup *memcg;
+
+		memcg = swap_cgroup_record(ent, NULL);
+		if (memcg) {
+			/* If memcg is obsolete, memcg can be != ptr */
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+			mem_cgroup_put(memcg);
+		}
+	}
 }
 
 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
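The uncharge here closes a window in which memsw counts the page twice: the swap-out path left one memsw charge attached to the swap entry (via swap_cgroup_record), and the swapin try_charge added a fresh one. An illustrative trace of the counters across the round trip, in PAGE_SIZE units with invented starting values:

/* Illustrative memsw double-count trace; the numbers are made up. */
#include <assert.h>

int main(void)
{
	int res = 1, memsw = 1;	/* page resident, both counters charged */

	res -= 1;		/* swap-out: res uncharged; the memsw charge
				 * stays, now owned by the swap record */
	res += 1; memsw += 1;	/* swapin try_charge: both charged afresh */
	assert(memsw == 2);	/* the double count the comment describes */

	memsw -= 1;		/* commit: uncharge once via the swap record */
	assert(res == 1 && memsw == 1);	/* consistent again */
	return 0;
}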
@@ -837,6 +953,8 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	if (!mem)
 		return;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (do_swap_account)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	css_put(&mem->css);
 }
 
@@ -844,29 +962,31 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 /*
  * uncharge if !page_mapped(page)
  */
-static void
+static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
 
 	if (mem_cgroup_subsys.disabled)
-		return;
+		return NULL;
 
 	if (PageSwapCache(page))
-		return;
+		return NULL;
 
 	/*
 	 * Check if our page_cgroup is valid
 	 */
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc || !PageCgroupUsed(pc)))
-		return;
+		return NULL;
 
 	lock_page_cgroup(pc);
 
+	mem = pc->mem_cgroup;
+
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
 
@@ -886,8 +1006,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+
 	ClearPageCgroupUsed(pc);
-	mem = pc->mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
@@ -895,14 +1018,13 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 	unlock_page_cgroup(pc);
 
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	return;
+	return mem;
 
 unlock_out:
 	unlock_page_cgroup(pc);
-	return;
+	return NULL;
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -922,10 +1044,42 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
-void mem_cgroup_uncharge_swapcache(struct page *page)
+/*
+ * Called from __delete_from_swap_cache(); drops the "page" account.
+ * The memcg information is recorded in the swap_cgroup of "ent".
+ */
+void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = __mem_cgroup_uncharge_common(page,
+					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+	/* record memcg information */
+	if (do_swap_account && memcg) {
+		swap_cgroup_record(ent, memcg);
+		mem_cgroup_get(memcg);
+	}
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * Called from swap_entry_free(); removes the record in swap_cgroup and
+ * uncharges the "memsw" account.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t ent)
 {
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+	struct mem_cgroup *memcg;
+
+	if (!do_swap_account)
+		return;
+
+	memcg = swap_cgroup_record(ent, NULL);
+	if (memcg) {
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		mem_cgroup_put(memcg);
+	}
 }
+#endif
 
 /*
  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
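These two functions form the hand-off pair for a swap entry's memsw charge. At swap-out, mem_cgroup_uncharge_swapcache() drops the page's res charge but keeps the memsw charge, records the owning memcg in swap_cgroup, and takes a reference (mem_cgroup_get) so the memcg structure outlives cgroup removal while entries still point at it. Whoever later clears that record, either swap_entry_free() via mem_cgroup_uncharge_swap() or one of the swapin charge paths, uncharges memsw once and drops the reference. A sketch of the pairing with stand-in types (nothing below is kernel code):

/* Record/reference pairing around one swap entry; illustrative only. */
#include <assert.h>
#include <stddef.h>

struct memcg { int refcnt; int memsw; };

static struct memcg *record;	/* the swap_cgroup slot for one entry */

/* swap_cgroup_record() analogue: install a new owner, return the old */
static struct memcg *swap_record(struct memcg *new)
{
	struct memcg *old = record;

	record = new;
	return old;
}

int main(void)
{
	struct memcg m = { .refcnt = 0, .memsw = 1 };
	struct memcg *owner;

	/* swap-out: remember the owner and pin it */
	swap_record(&m);
	m.refcnt++;				/* mem_cgroup_get() */

	/* entry freed: exactly one claimant gets the record back */
	owner = swap_record(NULL);
	if (owner) {
		owner->memsw--;			/* uncharge memsw */
		owner->refcnt--;		/* mem_cgroup_put() */
	}
	assert(swap_record(NULL) == NULL);	/* a later claimant gets nothing */
	assert(m.refcnt == 0 && m.memsw == 0);
	return 0;
}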
@@ -1034,7 +1188,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	rcu_read_unlock();
 
 	do {
-		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
 		progress += res_counter_check_under_limit(&mem->res);
 	} while (!progress && --retry);
 
@@ -1044,26 +1198,84 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	return 0;
 }
 
+static DEFINE_MUTEX(set_limit_mutex);
+
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 
 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
 	int progress;
+	u64 memswlimit;
 	int ret = 0;
 
-	while (res_counter_set_limit(&memcg->res, val)) {
+	while (retry_count) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
-		if (!retry_count) {
-			ret = -EBUSY;
+		/*
+		 * Rather than hiding this in some helper, do it open-coded
+		 * so that what really happens is visible. We have to
+		 * guarantee mem->res.limit <= mem->memsw.limit.
+		 */
+		mutex_lock(&set_limit_mutex);
+		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+		if (memswlimit < val) {
+			ret = -EINVAL;
+			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+		ret = res_counter_set_limit(&memcg->res, val);
+		mutex_unlock(&set_limit_mutex);
+
+		if (!ret)
+			break;
+
 		progress = try_to_free_mem_cgroup_pages(memcg,
-				GFP_HIGHUSER_MOVABLE);
-		if (!progress)
+						GFP_HIGHUSER_MOVABLE, false);
+		if (!progress)
+			retry_count--;
+	}
+	return ret;
+}
+
+int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
+				unsigned long long val)
+{
+	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
+	u64 memlimit, oldusage, curusage;
+	int ret;
+
+	if (!do_swap_account)
+		return -EINVAL;
+
+	while (retry_count) {
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+		/*
+		 * Rather than hiding this in some helper, do it open-coded
+		 * so that what really happens is visible. We have to
+		 * guarantee mem->res.limit <= mem->memsw.limit.
+		 */
+		mutex_lock(&set_limit_mutex);
+		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+		if (memlimit > val) {
+			ret = -EINVAL;
+			mutex_unlock(&set_limit_mutex);
+			break;
+		}
+		ret = res_counter_set_limit(&memcg->memsw, val);
+		mutex_unlock(&set_limit_mutex);
+
+		if (!ret)
+			break;
+
+		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
+		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		if (curusage >= oldusage)
 			retry_count--;
 	}
 	return ret;
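Both resize paths serialize on set_limit_mutex and enforce the same invariant from opposite sides: mem->res.limit may never exceed mem->memsw.limit, so lowering memsw below the memory limit and raising the memory limit above memsw are both rejected with -EINVAL. Note also the different retry heuristics: the memory path retries whenever reclaim made progress, while the memsw path compares memsw usage before and after a noswap reclaim, since "progress" that merely swapped pages out would not reduce mem+swap usage. A userspace sketch of just the locked validation (the types, names, and pthread mutex are stand-ins for the kernel's):

/* The limit-ordering invariant, checked under one mutex. */
#include <pthread.h>

static pthread_mutex_t set_limit_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long mem_limit   = 1ULL << 30;	/* 1G */
static unsigned long long memsw_limit = 2ULL << 30;	/* 2G */

static int set_mem_limit(unsigned long long val)
{
	int ret = 0;

	pthread_mutex_lock(&set_limit_mutex);
	if (val > memsw_limit)	/* would break mem <= mem+swap */
		ret = -1;	/* -EINVAL in the patch */
	else
		mem_limit = val;
	pthread_mutex_unlock(&set_limit_mutex);
	return ret;
}

static int set_memsw_limit(unsigned long long val)
{
	int ret = 0;

	pthread_mutex_lock(&set_limit_mutex);
	if (val < mem_limit)	/* mem+swap may never be the smaller one */
		ret = -1;
	else
		memsw_limit = val;
	pthread_mutex_unlock(&set_limit_mutex);
	return ret;
}

int main(void)
{
	return (set_mem_limit(3ULL << 30)   == -1 &&	/* above memsw */
		set_memsw_limit(1ULL << 29) == -1 &&	/* below mem */
		set_memsw_limit(4ULL << 30) ==  0) ? 0 : 1;
}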
@@ -1193,7 +1405,7 @@ try_to_free:
 		goto out;
 	}
 	progress = try_to_free_mem_cgroup_pages(mem,
-				GFP_HIGHUSER_MOVABLE);
+				GFP_HIGHUSER_MOVABLE, false);
 	if (!progress) {
 		nr_retries--;
 		/* maybe some writeback is necessary */
@@ -1216,8 +1428,25 @@ int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
-				    cft->private);
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	u64 val = 0;
+	int type, name;
+
+	type = MEMFILE_TYPE(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+	switch (type) {
+	case _MEM:
+		val = res_counter_read_u64(&mem->res, name);
+		break;
+	case _MEMSWAP:
+		if (do_swap_account)
+			val = res_counter_read_u64(&mem->memsw, name);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return val;
 }
 /*
  * The user of this function is...
@@ -1227,15 +1456,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			    const char *buffer)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	int type, name;
 	unsigned long long val;
 	int ret;
 
-	switch (cft->private) {
+	type = MEMFILE_TYPE(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+	switch (name) {
 	case RES_LIMIT:
 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
-		if (!ret)
+		if (ret)
+			break;
+		if (type == _MEM)
 			ret = mem_cgroup_resize_limit(memcg, val);
+		else
+			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 		break;
 	default:
 		ret = -EINVAL; /* should be BUG() ? */
@@ -1247,14 +1483,23 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
 	struct mem_cgroup *mem;
+	int type, name;
 
 	mem = mem_cgroup_from_cont(cont);
-	switch (event) {
+	type = MEMFILE_TYPE(event);
+	name = MEMFILE_ATTR(event);
+	switch (name) {
 	case RES_MAX_USAGE:
-		res_counter_reset_max(&mem->res);
+		if (type == _MEM)
+			res_counter_reset_max(&mem->res);
+		else
+			res_counter_reset_max(&mem->memsw);
 		break;
 	case RES_FAILCNT:
-		res_counter_reset_failcnt(&mem->res);
+		if (type == _MEM)
+			res_counter_reset_failcnt(&mem->res);
+		else
+			res_counter_reset_failcnt(&mem->memsw);
 		break;
 	}
 	return 0;
@@ -1315,24 +1560,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
-		.private = RES_USAGE,
+		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "max_usage_in_bytes",
-		.private = RES_MAX_USAGE,
+		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
-		.private = RES_LIMIT,
+		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
-		.private = RES_FAILCNT,
+		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
@@ -1346,6 +1591,47 @@ static struct cftype mem_cgroup_files[] = {
 	},
 };
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static struct cftype memsw_cgroup_files[] = {
+	{
+		.name = "memsw.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.max_usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.failcnt",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+};
+
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	if (!do_swap_account)
+		return 0;
+	return cgroup_add_files(cont, ss, memsw_cgroup_files,
+				ARRAY_SIZE(memsw_cgroup_files));
+}
+#else
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	return 0;
+}
+#endif
+
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
@@ -1404,14 +1690,44 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	return mem;
 }
 
+/*
+ * At mem_cgroup destruction, references from swap_cgroup records can
+ * remain (scanning them all at force_empty would be too costly...).
+ *
+ * Instead of clearing all references at force_empty, we remember the
+ * number of references from swap_cgroup and free the mem_cgroup only
+ * when that count goes down to 0.
+ *
+ * When a mem_cgroup is destroyed, mem->obsolete is set to 1, and any
+ * swap entry which points to this memcg will be ignored at swapin.
+ *
+ * Removal of the cgroup itself succeeds regardless of refs from swap.
+ */
+
 static void mem_cgroup_free(struct mem_cgroup *mem)
 {
+	if (atomic_read(&mem->refcnt) > 0)
+		return;
 	if (mem_cgroup_size() < PAGE_SIZE)
 		kfree(mem);
 	else
 		vfree(mem);
 }
 
+static void mem_cgroup_get(struct mem_cgroup *mem)
+{
+	atomic_inc(&mem->refcnt);
+}
+
+static void mem_cgroup_put(struct mem_cgroup *mem)
+{
+	if (atomic_dec_and_test(&mem->refcnt)) {
+		if (!mem->obsolete)
+			return;
+		mem_cgroup_free(mem);
+	}
+}
+
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 static void __init enable_swap_cgroup(void)
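The teardown protocol, then: mem_cgroup_free() refuses to free while swap records still hold references, and mem_cgroup_put() frees only once the count reaches zero and the cgroup has been marked obsolete; otherwise the still-live cgroup frees itself later through the normal destroy path. A single-threaded sketch of the pattern, ignoring the atomics the kernel needs (stand-in types throughout):

/* Obsolete-flag + refcount deferred free; illustrative, not kernel code. */
#include <stdlib.h>

struct memcg { int refcnt; int obsolete; };

static void memcg_free(struct memcg *mem)
{
	if (mem->refcnt > 0)	/* swap records still point here */
		return;
	free(mem);
}

static void memcg_put(struct memcg *mem)
{
	if (--mem->refcnt == 0) {	/* atomic_dec_and_test() in the patch */
		if (!mem->obsolete)	/* cgroup alive: destroy path frees */
			return;
		memcg_free(mem);
	}
}

int main(void)
{
	struct memcg *m = calloc(1, sizeof(*m));

	if (!m)
		return 1;
	m->refcnt = 1;		/* one swap record holds a reference */
	m->obsolete = 1;	/* pre_destroy ran; the cgroup is gone */
	memcg_put(m);		/* last reference dropped: freed here */
	return 0;
}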
@@ -1436,6 +1752,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return ERR_PTR(-ENOMEM);
 
 	res_counter_init(&mem->res);
+	res_counter_init(&mem->memsw);
 
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -1456,6 +1773,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 					struct cgroup *cont)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	mem->obsolete = 1;
 	mem_cgroup_force_empty(mem, false);
 }
 
@@ -1474,8 +1792,14 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	return cgroup_add_files(cont, ss, mem_cgroup_files,
-				ARRAY_SIZE(mem_cgroup_files));
+	int ret;
+
+	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
+				ARRAY_SIZE(mem_cgroup_files));
+
+	if (!ret)
+		ret = register_memsw_files(cont, ss);
+	return ret;
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
diff --git a/mm/memory.c b/mm/memory.c
index ba5189e322e6..1358012ffa73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2431,7 +2431,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-	if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) {
+	if (mem_cgroup_try_charge_swapin(mm, page,
+				GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) {
 		ret = VM_FAULT_OOM;
 		unlock_page(page);
 		goto out;
@@ -2449,8 +2450,20 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_nomap;
 	}
 
-	/* The page isn't present yet, go ahead with the fault. */
+	/*
+	 * The page isn't present yet, go ahead with the fault.
+	 *
+	 * Be careful about the sequence of operations here.
+	 * To get its accounting right, reuse_swap_page() must be called
+	 * while the page is counted on swap but not yet in mapcount i.e.
+	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+	 * must be called after the swap_free(), or it will never succeed.
+	 * And mem_cgroup_commit_charge_swapin(), which uses the swp_entry
+	 * in page->private, must be called before reuse_swap_page(),
+	 * which may delete_from_swap_cache().
+	 */
 
+	mem_cgroup_commit_charge_swapin(page, ptr);
 	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && reuse_swap_page(page)) {
@@ -2461,7 +2474,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	page_add_anon_rmap(page, vma, address);
-	mem_cgroup_commit_charge_swapin(page, ptr);
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
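Taken together with the comment above, the memory.c changes pin a strict call order in do_swap_page(). A distilled skeleton of that order (comment form only; this is not compilable kernel code, and everything but the ordering is elided):

/*
 * do_swap_page(), ordering only:
 *
 *   mem_cgroup_commit_charge_swapin(page, ptr);  reads page->private, so it
 *                                                must run before
 *                                                reuse_swap_page(), which may
 *                                                delete_from_swap_cache()
 *   pte = mk_pte(page, ...);
 *   if (write_access && reuse_swap_page(page))   page still counted on swap,
 *           ...                                  not yet in mapcount
 *   page_add_anon_rmap(page, vma, address);
 *   swap_free(entry);
 *   ... try_to_free_swap(page) ...               can only succeed after
 *                                                swap_free()
 */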
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 09291ca11f5f..3ecea98ecb45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/pgtable.h>
 
@@ -108,6 +109,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  */
 void __delete_from_swap_cache(struct page *page)
 {
+	swp_entry_t ent = {.val = page_private(page)};
+
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapCache(page));
 	VM_BUG_ON(PageWriteback(page));
@@ -118,7 +121,7 @@ void __delete_from_swap_cache(struct page *page)
 	total_swapcache_pages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
-	mem_cgroup_uncharge_swapcache(page);
+	mem_cgroup_uncharge_swapcache(page, ent);
 }
 
 /**
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e7a715a3866..0579d9069b61 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -471,8 +471,9 @@ out:
 	return NULL;
 }
 
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
 {
+	unsigned long offset = swp_offset(ent);
 	int count = p->swap_map[offset];
 
 	if (count < SWAP_MAP_MAX) {
@@ -487,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			mem_cgroup_uncharge_swap(ent);
 		}
 	}
 	return count;
@@ -502,7 +504,7 @@ void swap_free(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, swp_offset(entry));
+		swap_entry_free(p, entry);
 		spin_unlock(&swap_lock);
 	}
 }
@@ -582,7 +584,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+		if (swap_entry_free(p, entry) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -696,7 +698,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_try_charge(vma->vm_mm, GFP_HIGHUSER_MOVABLE, &ptr))
+	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
+					GFP_HIGHUSER_MOVABLE, &ptr))
 		ret = -ENOMEM;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
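swap_entry_free() now takes the whole swp_entry_t rather than a bare offset because the new memcg hook needs the full entry (swap type plus offset) to index the swap_cgroup records, while the offset alone still serves for the swap_map. A userspace mimic of the swp_entry_t round trip, with an invented shift value (the kernel derives its layout per architecture from the PTE format):

/* swp_entry_t round trip; SWP_TYPE_SHIFT here is made up. */
#include <assert.h>

typedef struct { unsigned long val; } swp_entry_t;

#define SWP_TYPE_SHIFT	24

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e = { (type << SWP_TYPE_SHIFT) | offset };
	return e;
}

static unsigned long swp_type(swp_entry_t e)
{
	return e.val >> SWP_TYPE_SHIFT;
}

static unsigned long swp_offset(swp_entry_t e)
{
	return e.val & ((1UL << SWP_TYPE_SHIFT) - 1);
}

int main(void)
{
	swp_entry_t e = swp_entry(1, 4096);

	/* swap_entry_free() keeps the offset for swap_map indexing and
	 * passes the full entry on to mem_cgroup_uncharge_swap() */
	assert(swp_type(e) == 1 && swp_offset(e) == 4096);
	return 0;
}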
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b07c48b09a93..f63b20dd7714 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1661,7 +1661,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
-						gfp_t gfp_mask)
+						gfp_t gfp_mask,
+						bool noswap)
 {
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
@@ -1674,6 +1675,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	};
 	struct zonelist *zonelist;
 
+	if (noswap)
+		sc.may_swap = 0;
+
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 	zonelist = NODE_DATA(numa_node_id())->node_zonelists;