Diffstat (limited to 'mm/memcontrol.c')
 mm/memcontrol.c | 400 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 362 insertions(+), 38 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59dd8c116372..2efcf38f3b73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -132,12 +133,18 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
+	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
 	struct mem_cgroup_lru_info info;
 
 	int	prev_priority;	/* for recording reclaim priority */
+	int		obsolete;
+	atomic_t	refcnt;
 	/*
 	 * statistics. This must be placed at the end of memcg.
 	 */
@@ -167,6 +174,17 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 	0, /* FORCE */
 };
 
+
+/* for encoding cft->private value on file */
+#define _MEM			(0)
+#define _MEMSWAP		(1)
+#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)	((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
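Each control file added later in this patch packs two values into the single cft->private integer: the counter it targets (_MEM or _MEMSWAP) in the high 16 bits, and the res_counter attribute (RES_USAGE, RES_LIMIT, ...) in the low 16 bits. A minimal user-space sketch of the round trip, reusing the macros exactly as defined above (the literal 2 merely stands in for one of the RES_* enum values, which are not reproduced in this diff):

	#include <assert.h>

	#define _MEM			(0)
	#define _MEMSWAP		(1)
	#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
	#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
	#define MEMFILE_ATTR(val)	((val) & 0xffff)

	int main(void)
	{
		int priv = MEMFILE_PRIVATE(_MEMSWAP, 2);	/* 2 stands in for a RES_* value */

		assert(MEMFILE_TYPE(priv) == _MEMSWAP);	/* which counter: res or memsw */
		assert(MEMFILE_ATTR(priv) == 2);	/* which attribute of that counter */
		return 0;
	}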
@@ -485,7 +503,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+			gfp_t gfp_mask, struct mem_cgroup **memcg,
+			bool oom)
 {
 	struct mem_cgroup *mem;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -513,12 +532,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		css_get(&mem->css);
 	}
 
+	while (1) {
+		int ret;
+		bool noswap = false;
 
-	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
+		ret = res_counter_charge(&mem->res, PAGE_SIZE);
+		if (likely(!ret)) {
+			if (!do_swap_account)
+				break;
+			ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
+			if (likely(!ret))
+				break;
+			/* mem+swap counter fails */
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			noswap = true;
+		}
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+		if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
 			continue;
 
 		/*
@@ -527,8 +559,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		 * moved to swap cache or just unmapped from the cgroup.
 		 * Check the limit again to see if the reclaim reduced the
 		 * current usage of the cgroup before giving up
+		 *
 		 */
-		if (res_counter_check_under_limit(&mem->res))
+		if (!do_swap_account &&
+			res_counter_check_under_limit(&mem->res))
+			continue;
+		if (do_swap_account &&
+			res_counter_check_under_limit(&mem->memsw))
 			continue;
 
 		if (!nr_retries--) {
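The ordering inside the new while (1) loop matters: res is charged first, memsw second, and a memsw failure rolls the res charge back and sets noswap, since swapping pages out only moves usage from mem to swap without shrinking mem+swap. A compact user-space model of that protocol (stand-in counter type and helpers, not the kernel's res_counter API):

	#include <stdbool.h>

	struct counter { unsigned long usage, limit; };

	/* stand-in for res_counter_charge(): 0 on success, -1 if over limit */
	static int charge(struct counter *c, unsigned long sz)
	{
		if (c->usage + sz > c->limit)
			return -1;
		c->usage += sz;
		return 0;
	}

	static void uncharge(struct counter *c, unsigned long sz)
	{
		c->usage -= sz;
	}

	/* models one pass of the while (1) loop in __mem_cgroup_try_charge() */
	static int try_charge_once(struct counter *res, struct counter *memsw,
				   unsigned long sz, bool do_swap_account,
				   bool *noswap)
	{
		if (charge(res, sz))
			return -1;		/* res over limit: reclaim (incl. swap-out) may help */
		if (!do_swap_account)
			return 0;
		if (charge(memsw, sz)) {
			uncharge(res, sz);	/* roll back to keep both counters consistent */
			*noswap = true;		/* swap-out cannot lower mem+swap usage */
			return -1;
		}
		return 0;
	}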
@@ -582,6 +619,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 		css_put(&mem->css);
 		return;
 	}
@@ -646,6 +685,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	__mem_cgroup_remove_list(from_mz, pc);
 	css_put(&from->css);
 	res_counter_uncharge(&from->res, PAGE_SIZE);
+	if (do_swap_account)
+		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	pc->mem_cgroup = to;
 	css_get(&to->css);
 	__mem_cgroup_add_list(to_mz, pc, false);
@@ -692,8 +733,11 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	/* drop extra refcnt */
 	css_put(&parent->css);
 	/* uncharge if move fails */
-	if (ret)
+	if (ret) {
 		res_counter_uncharge(&parent->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+	}
 
 	return ret;
 }
@@ -791,7 +835,42 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+				 struct page *page,
+				 gfp_t mask, struct mem_cgroup **ptr)
+{
+	struct mem_cgroup *mem;
+	swp_entry_t ent;
+
+	if (mem_cgroup_subsys.disabled)
+		return 0;
+
+	if (!do_swap_account)
+		goto charge_cur_mm;
+
+	/*
+	 * A racing thread's fault, or swapoff, may have already updated
+	 * the pte, and even removed page from swap cache: return success
+	 * to go on to do_swap_page()'s pte_same() test, which should fail.
+	 */
+	if (!PageSwapCache(page))
+		return 0;
+
+	ent.val = page_private(page);
+
+	mem = lookup_swap_cgroup(ent);
+	if (!mem || mem->obsolete)
+		goto charge_cur_mm;
+	*ptr = mem;
+	return __mem_cgroup_try_charge(NULL, mask, ptr, true);
+charge_cur_mm:
+	if (unlikely(!mm))
+		mm = &init_mm;
+	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+}
+
 #ifdef CONFIG_SWAP
+
 int mem_cgroup_cache_charge_swapin(struct page *page,
 			struct mm_struct *mm, gfp_t mask, bool locked)
 {
@@ -808,8 +887,28 @@ int mem_cgroup_cache_charge_swapin(struct page *page,
 	 * we reach here.
 	 */
 	if (PageSwapCache(page)) {
+		struct mem_cgroup *mem = NULL;
+		swp_entry_t ent;
+
+		ent.val = page_private(page);
+		if (do_swap_account) {
+			mem = lookup_swap_cgroup(ent);
+			if (mem && mem->obsolete)
+				mem = NULL;
+			if (mem)
+				mm = NULL;
+		}
 		ret = mem_cgroup_charge_common(page, mm, mask,
-				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
+				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+
+		if (!ret && do_swap_account) {
+			/* avoid double counting */
+			mem = swap_cgroup_record(ent, NULL);
+			if (mem) {
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+				mem_cgroup_put(mem);
+			}
+		}
 	}
 	if (!locked)
 		unlock_page(page);
@@ -828,6 +927,23 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
 		return;
 	pc = lookup_page_cgroup(page);
 	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+	/*
+	 * Now swap is on-memory. This means this page may be
+	 * counted both as mem and swap....double count.
+	 * Fix it by uncharging from memsw. This SwapCache is stable
+	 * because we're still under lock_page().
+	 */
+	if (do_swap_account) {
+		swp_entry_t ent = {.val = page_private(page)};
+		struct mem_cgroup *memcg;
+		memcg = swap_cgroup_record(ent, NULL);
+		if (memcg) {
+			/* If memcg is obsolete, memcg can be != ptr */
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+			mem_cgroup_put(memcg);
+		}
+
+	}
 }
 
 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
@@ -837,6 +953,8 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	if (!mem)
 		return;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (do_swap_account)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	css_put(&mem->css);
 }
 
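try/commit/cancel form a three-phase protocol for swap-in: precharge before the pte is reinstalled, then commit or cancel depending on the pte_same() race check that the comment in mem_cgroup_try_charge_swapin() refers to. A sketch of the intended calling sequence on the do_swap_page() side; the actual wiring lives in this series' mm/memory.c changes, not in this file, and the gfp mask and return codes below are illustrative only:

	static int swapin_charge_sketch(struct mm_struct *mm, struct page *page,
					pte_t *page_table, pte_t orig_pte)
	{
		struct mem_cgroup *ptr = NULL;

		/* phase 1: charge, preferably against the memcg recorded at swap-out */
		if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
			return -ENOMEM;

		if (pte_same(*page_table, orig_pte)) {
			/* phase 2a: mapping succeeded; commit, fixing the double count */
			mem_cgroup_commit_charge_swapin(page, ptr);
			return 0;
		}
		/* phase 2b: lost a race with another fault or swapoff; undo */
		mem_cgroup_cancel_charge_swapin(ptr);
		return -EAGAIN;
	}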
@@ -844,29 +962,31 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 /*
  * uncharge if !page_mapped(page)
  */
-static void
+static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
 
 	if (mem_cgroup_subsys.disabled)
-		return;
+		return NULL;
 
 	if (PageSwapCache(page))
-		return;
+		return NULL;
 
 	/*
 	 * Check if our page_cgroup is valid
 	 */
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc || !PageCgroupUsed(pc)))
-		return;
+		return NULL;
 
 	lock_page_cgroup(pc);
 
+	mem = pc->mem_cgroup;
+
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
 
@@ -886,8 +1006,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+
 	ClearPageCgroupUsed(pc);
-	mem = pc->mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
@@ -895,14 +1018,13 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 	unlock_page_cgroup(pc);
 
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	return;
+	return mem;
 
 unlock_out:
 	unlock_page_cgroup(pc);
-	return;
+	return NULL;
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -922,10 +1044,42 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
-void mem_cgroup_uncharge_swapcache(struct page *page)
+/*
+ * called from __delete_from_swap_cache(); drops the "page" account.
+ * memcg information is recorded in the swap_cgroup of "ent"
+ */
+void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = __mem_cgroup_uncharge_common(page,
+					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+	/* record memcg information */
+	if (do_swap_account && memcg) {
+		swap_cgroup_record(ent, memcg);
+		mem_cgroup_get(memcg);
+	}
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * called from swap_entry_free(); removes the record in swap_cgroup and
+ * uncharges the "memsw" account.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t ent)
 {
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+	struct mem_cgroup *memcg;
+
+	if (!do_swap_account)
+		return;
+
+	memcg = swap_cgroup_record(ent, NULL);
+	if (memcg) {
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		mem_cgroup_put(memcg);
+	}
 }
+#endif
 
 /*
  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
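Together with mem_cgroup_commit_charge_swapin() above, these two functions hand a page's charge back and forth between the memsw counter and the swap_cgroup record. A summary of the lifecycle as implemented by this patch:

	/*
	 * swap-out (__delete_from_swap_cache):
	 *	mem_cgroup_uncharge_swapcache(page, ent)
	 *		- uncharges mem->res only; the SWAPOUT charge type makes
	 *		  __mem_cgroup_uncharge_common() skip memsw
	 *		- swap_cgroup_record(ent, memcg) + mem_cgroup_get(memcg)
	 *
	 * swap-in (do_swap_page commit):
	 *	mem_cgroup_commit_charge_swapin(page, ptr)
	 *		- swap_cgroup_record(ent, NULL), uncharge memsw,
	 *		  mem_cgroup_put() -- resolves the mem+swap double count
	 *
	 * swap slot freed (swap_entry_free):
	 *	mem_cgroup_uncharge_swap(ent)
	 *		- swap_cgroup_record(ent, NULL), uncharge memsw,
	 *		  mem_cgroup_put()
	 */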
@@ -1034,7 +1188,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	rcu_read_unlock();
 
 	do {
-		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
 		progress += res_counter_check_under_limit(&mem->res);
 	} while (!progress && --retry);
 
@@ -1044,26 +1198,84 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	return 0;
 }
 
+static DEFINE_MUTEX(set_limit_mutex);
+
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 
 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
 	int progress;
+	u64 memswlimit;
 	int ret = 0;
 
-	while (res_counter_set_limit(&memcg->res, val)) {
+	while (retry_count) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
-		if (!retry_count) {
-			ret = -EBUSY;
+		/*
+		 * Rather than hide all in some function, I do this in
+		 * open coded manner. You see what this really does.
+		 * We have to guarantee mem->res.limit < mem->memsw.limit.
+		 */
+		mutex_lock(&set_limit_mutex);
+		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+		if (memswlimit < val) {
+			ret = -EINVAL;
+			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+		ret = res_counter_set_limit(&memcg->res, val);
+		mutex_unlock(&set_limit_mutex);
+
+		if (!ret)
+			break;
+
 		progress = try_to_free_mem_cgroup_pages(memcg,
-				GFP_HIGHUSER_MOVABLE);
-		if (!progress)
+				GFP_HIGHUSER_MOVABLE, false);
+		if (!progress)	retry_count--;
+	}
+	return ret;
+}
+
+int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
+				unsigned long long val)
+{
+	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
+	u64 memlimit, oldusage, curusage;
+	int ret;
+
+	if (!do_swap_account)
+		return -EINVAL;
+
+	while (retry_count) {
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+		/*
+		 * Rather than hide all in some function, I do this in
+		 * open coded manner. You see what this really does.
+		 * We have to guarantee mem->res.limit < mem->memsw.limit.
+		 */
+		mutex_lock(&set_limit_mutex);
+		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+		if (memlimit > val) {
+			ret = -EINVAL;
+			mutex_unlock(&set_limit_mutex);
+			break;
+		}
+		ret = res_counter_set_limit(&memcg->memsw, val);
+		mutex_unlock(&set_limit_mutex);
+
+		if (!ret)
+			break;
+
+		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
+		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		if (curusage >= oldusage)
 			retry_count--;
 	}
 	return ret;
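Both setters serialize on set_limit_mutex, read the other counter's limit, and reject values that would violate res.limit <= memsw.limit, so the invariant holds at every instant. One practical consequence for callers (a sketch; the 1 GiB and 2 GiB values are arbitrary): limits must be raised memsw-first and lowered mem-first:

	int ret;

	/* growing both limits: memsw.limit has to move first, because
	 * raising res.limit above the current memsw.limit is -EINVAL */
	ret = mem_cgroup_resize_memsw_limit(memcg, 2ULL << 30);	/* 2 GiB */
	if (!ret)
		ret = mem_cgroup_resize_limit(memcg, 1ULL << 30);	/* 1 GiB */

	/* shrinking both goes the other way: res.limit first, since
	 * lowering memsw.limit below the current res.limit is -EINVAL */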
@@ -1193,7 +1405,7 @@ try_to_free:
 			goto out;
 		}
 		progress = try_to_free_mem_cgroup_pages(mem,
-						GFP_HIGHUSER_MOVABLE);
+						GFP_HIGHUSER_MOVABLE, false);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -1216,8 +1428,25 @@ int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
-				    cft->private);
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	u64 val = 0;
+	int type, name;
+
+	type = MEMFILE_TYPE(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+	switch (type) {
+	case _MEM:
+		val = res_counter_read_u64(&mem->res, name);
+		break;
+	case _MEMSWAP:
+		if (do_swap_account)
+			val = res_counter_read_u64(&mem->memsw, name);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return val;
 }
 /*
  * The user of this function is...
@@ -1227,15 +1456,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			    const char *buffer)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	int type, name;
 	unsigned long long val;
 	int ret;
 
-	switch (cft->private) {
+	type = MEMFILE_TYPE(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+	switch (name) {
 	case RES_LIMIT:
 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
-		if (!ret)
+		if (ret)
+			break;
+		if (type == _MEM)
 			ret = mem_cgroup_resize_limit(memcg, val);
+		else
+			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 		break;
 	default:
 		ret = -EINVAL; /* should be BUG() ? */
@@ -1247,14 +1483,23 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
 	struct mem_cgroup *mem;
+	int type, name;
 
 	mem = mem_cgroup_from_cont(cont);
-	switch (event) {
+	type = MEMFILE_TYPE(event);
+	name = MEMFILE_ATTR(event);
+	switch (name) {
 	case RES_MAX_USAGE:
-		res_counter_reset_max(&mem->res);
+		if (type == _MEM)
+			res_counter_reset_max(&mem->res);
+		else
+			res_counter_reset_max(&mem->memsw);
 		break;
 	case RES_FAILCNT:
-		res_counter_reset_failcnt(&mem->res);
+		if (type == _MEM)
+			res_counter_reset_failcnt(&mem->res);
+		else
+			res_counter_reset_failcnt(&mem->memsw);
 		break;
 	}
 	return 0;
@@ -1315,24 +1560,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
-		.private = RES_USAGE,
+		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "max_usage_in_bytes",
-		.private = RES_MAX_USAGE,
+		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
-		.private = RES_LIMIT,
+		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
-		.private = RES_FAILCNT,
+		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
@@ -1346,6 +1591,47 @@ static struct cftype mem_cgroup_files[] = {
 	},
 };
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static struct cftype memsw_cgroup_files[] = {
+	{
+		.name = "memsw.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.max_usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.failcnt",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+};
+
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	if (!do_swap_account)
+		return 0;
+	return cgroup_add_files(cont, ss, memsw_cgroup_files,
+				ARRAY_SIZE(memsw_cgroup_files));
+};
+#else
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	return 0;
+}
+#endif
+
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
@@ -1404,14 +1690,44 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	return mem;
 }
 
+/*
+ * At destroying mem_cgroup, references from swap_cgroup can remain.
+ * (scanning all at force_empty is too costly...)
+ *
+ * Instead of clearing all references at force_empty, we remember
+ * the number of references from swap_cgroup and free mem_cgroup when
+ * it goes down to 0.
+ *
+ * When mem_cgroup is destroyed, mem->obsolete will be set to 1 and
+ * entries which point to this memcg will be ignored at swapin.
+ *
+ * Removal of the cgroup itself succeeds regardless of refs from swap.
+ */
+
 static void mem_cgroup_free(struct mem_cgroup *mem)
 {
+	if (atomic_read(&mem->refcnt) > 0)
+		return;
 	if (mem_cgroup_size() < PAGE_SIZE)
 		kfree(mem);
 	else
 		vfree(mem);
 }
 
+static void mem_cgroup_get(struct mem_cgroup *mem)
+{
+	atomic_inc(&mem->refcnt);
+}
+
+static void mem_cgroup_put(struct mem_cgroup *mem)
+{
+	if (atomic_dec_and_test(&mem->refcnt)) {
+		if (!mem->obsolete)
+			return;
+		mem_cgroup_free(mem);
+	}
+}
+
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 static void __init enable_swap_cgroup(void)
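The refcnt/obsolete pair lets a mem_cgroup outlive its cgroup directory: rmdir completes immediately, while the structure itself stays allocated until the last swap_cgroup reference is dropped. A sketch of the teardown ordering, assuming mem_cgroup_destroy() (unchanged context, not shown in this diff) frees through mem_cgroup_free():

	/*
	 * rmdir:
	 *	mem_cgroup_pre_destroy()  -> mem->obsolete = 1
	 *	mem_cgroup_destroy()      -> mem_cgroup_free(mem), which now
	 *		returns early while refcnt > 0, i.e. while swap entries
	 *		recorded via mem_cgroup_get() still point at this memcg
	 *
	 * afterwards, as each such swap entry is swapped in or freed:
	 *	mem_cgroup_put()          -> the final put sees mem->obsolete
	 *		set and calls mem_cgroup_free(), which really frees it
	 */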
@@ -1436,6 +1752,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return ERR_PTR(-ENOMEM);
 
 	res_counter_init(&mem->res);
+	res_counter_init(&mem->memsw);
 
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -1456,6 +1773,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 					struct cgroup *cont)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	mem->obsolete = 1;
 	mem_cgroup_force_empty(mem, false);
 }
 
@@ -1474,8 +1792,14 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	return cgroup_add_files(cont, ss, mem_cgroup_files,
-				ARRAY_SIZE(mem_cgroup_files));
+	int ret;
+
+	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
+			ARRAY_SIZE(mem_cgroup_files));
+
+	if (!ret)
+		ret = register_memsw_files(cont, ss);
+	return ret;
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,