 Documentation/controllers/memory.txt |  29
 include/linux/memcontrol.h           |  11
 include/linux/swap.h                 |  14
 mm/memcontrol.c                      | 400
 mm/memory.c                          |  18
 mm/swap_state.c                      |   5
 mm/swapfile.c                        |  11
 mm/vmscan.c                          |   6
 8 files changed, 440 insertions(+), 54 deletions(-)
diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
index 9fe2d0eabe05..05fe29ab1e58 100644
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -137,12 +137,32 @@ behind this approach is that a cgroup that aggressively uses a shared
 page will eventually get charged for it (once it is uncharged from
 the cgroup that brought it in -- this will happen on memory pressure).
 
-Exception: When you do swapoff and make swapped-out pages of shmem(tmpfs) to
+Exception: If CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not used.
+When you do swapoff and make swapped-out pages of shmem(tmpfs) to
 be backed into memory in force, charges for pages are accounted against the
 caller of swapoff rather than the users of shmem.
 
 
-2.4 Reclaim
+2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
+The Swap Extension allows you to record charges for swap. A swapped-in page
+is charged back to the original page allocator if possible.
+
+When swap is accounted, the following files are added.
+ - memory.memsw.usage_in_bytes
+ - memory.memsw.limit_in_bytes
+
+Usage of mem+swap is limited by memsw.limit_in_bytes.
+
+Note: why 'mem+swap' rather than just swap.
+The global LRU (kswapd) can swap out arbitrary pages. Swapping a page out
+moves its charge from memory to swap; there is no change in the usage of
+mem+swap.
+
+In other words, when we want to limit swap usage without affecting the
+global LRU, a mem+swap limit is better than a plain swap limit from the
+OS point of view.
+
+2.5 Reclaim
 
 Each cgroup maintains a per cgroup LRU that consists of an active
 and inactive list. When a cgroup goes over its limit, we first try
@@ -246,6 +266,11 @@ Such charges are freed(at default) or moved to its parent. When moved,
 both of RSS and CACHES are moved to parent.
 If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also.
 
+Charges recorded in swap information are not updated at removal of a cgroup.
+The recorded information is discarded, and a cgroup which later uses the
+swap (swapcache) will be charged as its new owner.
+
+
 5. Misc. interfaces.
 
 5.1 force_empty
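For readers trying the new interface, a minimal user-space sketch of checking the documented invariant usage(mem) <= usage(mem+swap). The mount point and group name below are assumptions about the test setup, not anything this patch defines:

/* Sketch only: compare one cgroup's mem and mem+swap usage. */
#include <stdio.h>

static unsigned long long read_u64(const char *path)
{
    unsigned long long val = 0;
    FILE *f = fopen(path, "r");

    if (!f || fscanf(f, "%llu", &val) != 1)
        perror(path);
    if (f)
        fclose(f);
    return val;
}

int main(void)
{
    /* hypothetical v1 hierarchy mounted at /cgroups/memory */
    const char *base = "/cgroups/memory/test";
    char path[256];
    unsigned long long mem, memsw;

    snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", base);
    mem = read_u64(path);
    snprintf(path, sizeof(path), "%s/memory.memsw.usage_in_bytes", base);
    memsw = read_u64(path);

    printf("mem=%llu memsw=%llu\n", mem, memsw);
    return memsw >= mem ? 0 : 1;    /* memsw can never be smaller */
}

Any group should pass, since memsw counts memory plus swap: a memsw charge is always taken together with the mem charge, or converted from one at swap-out.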
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 41b46cc9d1f1..ca51ac72d6c0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -32,6 +32,8 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
 /* for swap handling */
 extern int mem_cgroup_try_charge(struct mm_struct *mm,
         gfp_t gfp_mask, struct mem_cgroup **ptr);
+extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+        struct page *page, gfp_t mask, struct mem_cgroup **ptr);
 extern void mem_cgroup_commit_charge_swapin(struct page *page,
     struct mem_cgroup *ptr);
 extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
@@ -80,7 +82,6 @@ extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 extern int do_swap_account;
 #endif
-
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -97,7 +98,13 @@ static inline int mem_cgroup_cache_charge(struct page *page,
 }
 
 static inline int mem_cgroup_try_charge(struct mm_struct *mm,
             gfp_t gfp_mask, struct mem_cgroup **ptr)
+{
+    return 0;
+}
+
+static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+        struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
 {
     return 0;
 }
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f8f3907533f0..be938ce4895a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,7 +214,7 @@ static inline void lru_cache_add_active_file(struct page *page)
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                     gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
-                        gfp_t gfp_mask);
+                        gfp_t gfp_mask, bool noswap);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
@@ -336,7 +336,7 @@ static inline void disable_swap_token(void)
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern int mem_cgroup_cache_charge_swapin(struct page *page,
         struct mm_struct *mm, gfp_t mask, bool locked);
-extern void mem_cgroup_uncharge_swapcache(struct page *page);
+extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent);
 #else
 static inline
 int mem_cgroup_cache_charge_swapin(struct page *page,
@@ -344,7 +344,15 @@ int mem_cgroup_cache_charge_swapin(struct page *page,
 {
     return 0;
 }
-static inline void mem_cgroup_uncharge_swapcache(struct page *page)
+static inline void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+}
+#endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
+#else
+static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
 {
 }
 #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59dd8c116372..2efcf38f3b73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -132,12 +133,18 @@ struct mem_cgroup {
      */
     struct res_counter res;
     /*
+     * the counter to account for mem+swap usage.
+     */
+    struct res_counter memsw;
+    /*
      * Per cgroup active and inactive list, similar to the
      * per zone LRU lists.
      */
     struct mem_cgroup_lru_info info;
 
     int prev_priority;  /* for recording reclaim priority */
+    int obsolete;
+    atomic_t refcnt;
     /*
      * statistics. This must be placed at the end of memcg.
      */
@@ -167,6 +174,17 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
     0, /* FORCE */
 };
 
+
+/* for encoding cft->private value on file */
+#define _MEM            (0)
+#define _MEMSWAP        (1)
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_TYPE(val)   (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)   ((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
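The MEMFILE_* helpers pack a counter type (upper 16 bits) and a RES_* attribute (lower 16 bits) into the single cft->private integer that cgroup control files carry. A stand-alone round-trip check; the numeric value used for RES_LIMIT here is a stand-in, not the kernel's enum:

#include <assert.h>
#include <stdio.h>

#define _MEM            (0)
#define _MEMSWAP        (1)
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_TYPE(val)   (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)   ((val) & 0xffff)

int main(void)
{
    int res_limit = 2;  /* stand-in for the kernel's RES_LIMIT */
    int priv = MEMFILE_PRIVATE(_MEMSWAP, res_limit);

    /* the two halves come back out unchanged */
    assert(MEMFILE_TYPE(priv) == _MEMSWAP);
    assert(MEMFILE_ATTR(priv) == res_limit);
    printf("private=0x%x type=%d attr=%d\n",
           priv, MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
    return 0;
}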
@@ -485,7 +503,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-            gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+            gfp_t gfp_mask, struct mem_cgroup **memcg,
+            bool oom)
 {
     struct mem_cgroup *mem;
     int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -513,12 +532,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
         css_get(&mem->css);
     }
 
+    while (1) {
+        int ret;
+        bool noswap = false;
 
-    while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
+        ret = res_counter_charge(&mem->res, PAGE_SIZE);
+        if (likely(!ret)) {
+            if (!do_swap_account)
+                break;
+            ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
+            if (likely(!ret))
+                break;
+            /* mem+swap counter fails */
+            res_counter_uncharge(&mem->res, PAGE_SIZE);
+            noswap = true;
+        }
         if (!(gfp_mask & __GFP_WAIT))
             goto nomem;
 
-        if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+        if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
             continue;
 
         /*
@@ -527,8 +559,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
          * moved to swap cache or just unmapped from the cgroup.
          * Check the limit again to see if the reclaim reduced the
          * current usage of the cgroup before giving up
+         *
          */
-        if (res_counter_check_under_limit(&mem->res))
+        if (!do_swap_account &&
+            res_counter_check_under_limit(&mem->res))
+            continue;
+        if (do_swap_account &&
+            res_counter_check_under_limit(&mem->memsw))
             continue;
 
         if (!nr_retries--) {
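The rewritten loop encodes the central charging rule: charge mem first, then memsw, and roll the mem charge back if memsw fails, so the two counters never drift apart; a memsw failure also sets noswap, because swapping pages out cannot lower mem+swap usage. A reduced user-space model of just that rule, with plain structs standing in for res_counters (not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct counter { long usage, limit; };

static bool counter_charge(struct counter *c, long sz)
{
    if (c->usage + sz > c->limit)
        return false;
    c->usage += sz;
    return true;
}

/* mirror of the try_charge ordering: res first, memsw second */
static bool try_charge(struct counter *res, struct counter *memsw, long sz)
{
    if (!counter_charge(res, sz))
        return false;       /* over the mem limit: reclaim may swap */
    if (!counter_charge(memsw, sz)) {
        res->usage -= sz;   /* roll back so counters stay in sync */
        return false;       /* over mem+swap: swapping cannot help */
    }
    return true;
}

int main(void)
{
    struct counter res = { 0, 4096 }, memsw = { 0, 8192 };
    printf("first page: %d\n", try_charge(&res, &memsw, 4096));
    printf("second page: %d\n", try_charge(&res, &memsw, 4096));
    return 0;
}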
@@ -582,6 +619,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
     if (unlikely(PageCgroupUsed(pc))) {
         unlock_page_cgroup(pc);
         res_counter_uncharge(&mem->res, PAGE_SIZE);
+        if (do_swap_account)
+            res_counter_uncharge(&mem->memsw, PAGE_SIZE);
         css_put(&mem->css);
         return;
     }
@@ -646,6 +685,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
     __mem_cgroup_remove_list(from_mz, pc);
     css_put(&from->css);
     res_counter_uncharge(&from->res, PAGE_SIZE);
+    if (do_swap_account)
+        res_counter_uncharge(&from->memsw, PAGE_SIZE);
     pc->mem_cgroup = to;
     css_get(&to->css);
     __mem_cgroup_add_list(to_mz, pc, false);
@@ -692,8 +733,11 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
     /* drop extra refcnt */
     css_put(&parent->css);
     /* uncharge if move fails */
-    if (ret)
+    if (ret) {
         res_counter_uncharge(&parent->res, PAGE_SIZE);
+        if (do_swap_account)
+            res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+    }
 
     return ret;
 }
@@ -791,7 +835,42 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+                 struct page *page,
+                 gfp_t mask, struct mem_cgroup **ptr)
+{
+    struct mem_cgroup *mem;
+    swp_entry_t ent;
+
+    if (mem_cgroup_subsys.disabled)
+        return 0;
+
+    if (!do_swap_account)
+        goto charge_cur_mm;
+
+    /*
+     * A racing thread's fault, or swapoff, may have already updated
+     * the pte, and even removed page from swap cache: return success
+     * to go on to do_swap_page()'s pte_same() test, which should fail.
+     */
+    if (!PageSwapCache(page))
+        return 0;
+
+    ent.val = page_private(page);
+
+    mem = lookup_swap_cgroup(ent);
+    if (!mem || mem->obsolete)
+        goto charge_cur_mm;
+    *ptr = mem;
+    return __mem_cgroup_try_charge(NULL, mask, ptr, true);
+charge_cur_mm:
+    if (unlikely(!mm))
+        mm = &init_mm;
+    return __mem_cgroup_try_charge(mm, mask, ptr, true);
+}
+
 #ifdef CONFIG_SWAP
+
 int mem_cgroup_cache_charge_swapin(struct page *page,
             struct mm_struct *mm, gfp_t mask, bool locked)
 {
@@ -808,8 +887,28 @@ int mem_cgroup_cache_charge_swapin(struct page *page,
      * we reach here.
      */
     if (PageSwapCache(page)) {
+        struct mem_cgroup *mem = NULL;
+        swp_entry_t ent;
+
+        ent.val = page_private(page);
+        if (do_swap_account) {
+            mem = lookup_swap_cgroup(ent);
+            if (mem && mem->obsolete)
+                mem = NULL;
+            if (mem)
+                mm = NULL;
+        }
         ret = mem_cgroup_charge_common(page, mm, mask,
-                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
+                MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+
+        if (!ret && do_swap_account) {
+            /* avoid double counting */
+            mem = swap_cgroup_record(ent, NULL);
+            if (mem) {
+                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+                mem_cgroup_put(mem);
+            }
+        }
     }
     if (!locked)
         unlock_page(page);
@@ -828,6 +927,23 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
         return;
     pc = lookup_page_cgroup(page);
     __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+    /*
+     * Now swap is on-memory. This means this page may be
+     * counted both as mem and swap....double count.
+     * Fix it by uncharging from memsw. This SwapCache is stable
+     * because we're still under lock_page().
+     */
+    if (do_swap_account) {
+        swp_entry_t ent = {.val = page_private(page)};
+        struct mem_cgroup *memcg;
+        memcg = swap_cgroup_record(ent, NULL);
+        if (memcg) {
+            /* If memcg is obsolete, memcg can be != ptr */
+            res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+            mem_cgroup_put(memcg);
+        }
+
+    }
 }
 
 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
@@ -837,6 +953,8 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
     if (!mem)
         return;
     res_counter_uncharge(&mem->res, PAGE_SIZE);
+    if (do_swap_account)
+        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
     css_put(&mem->css);
 }
 
@@ -844,29 +962,31 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 /*
  * uncharge if !page_mapped(page)
  */
-static void
+static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
     struct page_cgroup *pc;
-    struct mem_cgroup *mem;
+    struct mem_cgroup *mem = NULL;
     struct mem_cgroup_per_zone *mz;
     unsigned long flags;
 
     if (mem_cgroup_subsys.disabled)
-        return;
+        return NULL;
 
     if (PageSwapCache(page))
-        return;
+        return NULL;
 
     /*
      * Check if our page_cgroup is valid
      */
     pc = lookup_page_cgroup(page);
     if (unlikely(!pc || !PageCgroupUsed(pc)))
-        return;
+        return NULL;
 
     lock_page_cgroup(pc);
 
+    mem = pc->mem_cgroup;
+
     if (!PageCgroupUsed(pc))
         goto unlock_out;
 
@@ -886,8 +1006,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
         break;
     }
 
+    res_counter_uncharge(&mem->res, PAGE_SIZE);
+    if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+
     ClearPageCgroupUsed(pc);
-    mem = pc->mem_cgroup;
 
     mz = page_cgroup_zoneinfo(pc);
     spin_lock_irqsave(&mz->lru_lock, flags);
@@ -895,14 +1018,13 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
     spin_unlock_irqrestore(&mz->lru_lock, flags);
     unlock_page_cgroup(pc);
 
-    res_counter_uncharge(&mem->res, PAGE_SIZE);
     css_put(&mem->css);
 
-    return;
+    return mem;
 
 unlock_out:
     unlock_page_cgroup(pc);
-    return;
+    return NULL;
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -922,10 +1044,42 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
     __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
-void mem_cgroup_uncharge_swapcache(struct page *page)
+/*
+ * called from __delete_from_swap_cache() and drop "page" account.
+ * memcg information is recorded to swap_cgroup of "ent"
+ */
+void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+    struct mem_cgroup *memcg;
+
+    memcg = __mem_cgroup_uncharge_common(page,
+                    MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+    /* record memcg information */
+    if (do_swap_account && memcg) {
+        swap_cgroup_record(ent, memcg);
+        mem_cgroup_get(memcg);
+    }
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * called from swap_entry_free(). remove record in swap_cgroup and
+ * uncharge "memsw" account.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t ent)
 {
-    __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+    struct mem_cgroup *memcg;
+
+    if (!do_swap_account)
+        return;
+
+    memcg = swap_cgroup_record(ent, NULL);
+    if (memcg) {
+        res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+        mem_cgroup_put(memcg);
+    }
 }
+#endif
 
 /*
  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
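Together with the commit path above, these two functions implement an ownership handoff: swap-out stores the memcg under the swap entry and takes a reference; whoever clears the record later (swap-in commit or swap_entry_free()) uncharges memsw and drops the reference. A compact model of that exchange; a plain array stands in for the real swap_cgroup store, and plain ints for the atomics:

#include <stdio.h>

#define NR_SWAP_ENTRIES 8

struct memcg { int refcnt; long memsw_usage; };

static struct memcg *swap_record[NR_SWAP_ENTRIES];

/* like swap_cgroup_record(): store the new owner, return the old one */
static struct memcg *swap_cgroup_record(unsigned ent, struct memcg *mem)
{
    struct memcg *old = swap_record[ent];
    swap_record[ent] = mem;
    return old;
}

static void swap_out(unsigned ent, struct memcg *mem)
{
    swap_cgroup_record(ent, mem);
    mem->refcnt++;          /* mem_cgroup_get() */
}

/* swap-in commit and swap_entry_free() both end the record this way */
static void swap_release(unsigned ent, long page_size)
{
    struct memcg *mem = swap_cgroup_record(ent, NULL);
    if (mem) {
        mem->memsw_usage -= page_size;  /* uncharge memsw */
        mem->refcnt--;                  /* mem_cgroup_put() */
    }
}

int main(void)
{
    struct memcg g = { .refcnt = 0, .memsw_usage = 4096 };
    swap_out(3, &g);
    swap_release(3, 4096);
    printf("refcnt=%d memsw=%ld\n", g.refcnt, g.memsw_usage);
    return 0;
}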
@@ -1034,7 +1188,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
     rcu_read_unlock();
 
     do {
-        progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+        progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
         progress += res_counter_check_under_limit(&mem->res);
     } while (!progress && --retry);
 
@@ -1044,26 +1198,84 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
     return 0;
 }
 
+static DEFINE_MUTEX(set_limit_mutex);
+
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                 unsigned long long val)
 {
 
     int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
     int progress;
+    u64 memswlimit;
     int ret = 0;
 
-    while (res_counter_set_limit(&memcg->res, val)) {
+    while (retry_count) {
         if (signal_pending(current)) {
             ret = -EINTR;
             break;
         }
-        if (!retry_count) {
-            ret = -EBUSY;
+        /*
+         * Rather than hide all in some function, I do this in
+         * open coded manner. You see what this really does.
+         * We have to guarantee mem->res.limit < mem->memsw.limit.
+         */
+        mutex_lock(&set_limit_mutex);
+        memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+        if (memswlimit < val) {
+            ret = -EINVAL;
+            mutex_unlock(&set_limit_mutex);
             break;
         }
+        ret = res_counter_set_limit(&memcg->res, val);
+        mutex_unlock(&set_limit_mutex);
+
+        if (!ret)
+            break;
+
         progress = try_to_free_mem_cgroup_pages(memcg,
-                GFP_HIGHUSER_MOVABLE);
-        if (!progress)
+                GFP_HIGHUSER_MOVABLE, false);
+        if (!progress) retry_count--;
+    }
+    return ret;
+}
+
+int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
+                unsigned long long val)
+{
+    int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
+    u64 memlimit, oldusage, curusage;
+    int ret;
+
+    if (!do_swap_account)
+        return -EINVAL;
+
+    while (retry_count) {
+        if (signal_pending(current)) {
+            ret = -EINTR;
+            break;
+        }
+        /*
+         * Rather than hide all in some function, I do this in
+         * open coded manner. You see what this really does.
+         * We have to guarantee mem->res.limit < mem->memsw.limit.
+         */
+        mutex_lock(&set_limit_mutex);
+        memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+        if (memlimit > val) {
+            ret = -EINVAL;
+            mutex_unlock(&set_limit_mutex);
+            break;
+        }
+        ret = res_counter_set_limit(&memcg->memsw, val);
+        mutex_unlock(&set_limit_mutex);
+
+        if (!ret)
+            break;
+
+        oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+        try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
+        curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+        if (curusage >= oldusage)
             retry_count--;
     }
     return ret;
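Both resize paths take set_limit_mutex and check the opposite counter, so the invariant mem->res.limit <= mem->memsw.limit holds even against a concurrent writer of the other file. Stripped of the reclaim retries, the paired checks reduce to the following sketch; plain variables stand in for the res_counters:

#include <stdio.h>

typedef unsigned long long u64;

static u64 mem_limit = 1ULL << 20, memsw_limit = 1ULL << 21;

/* memory.limit_in_bytes: must stay at or below the mem+swap limit */
static int set_mem_limit(u64 val)
{
    if (memsw_limit < val)
        return -1;  /* -EINVAL in the kernel */
    mem_limit = val;
    return 0;
}

/* memory.memsw.limit_in_bytes: must stay at or above the mem limit */
static int set_memsw_limit(u64 val)
{
    if (mem_limit > val)
        return -1;
    memsw_limit = val;
    return 0;
}

int main(void)
{
    printf("shrink memsw below mem: %d\n", set_memsw_limit(1 << 19));
    printf("raise mem above memsw: %d\n", set_mem_limit(1ULL << 22));
    printf("legal shrink of mem:   %d\n", set_mem_limit(1 << 19));
    return 0;
}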
@@ -1193,7 +1405,7 @@ try_to_free:
             goto out;
         }
         progress = try_to_free_mem_cgroup_pages(mem,
-                    GFP_HIGHUSER_MOVABLE);
+                    GFP_HIGHUSER_MOVABLE, false);
         if (!progress) {
             nr_retries--;
             /* maybe some writeback is necessary */
@@ -1216,8 +1428,25 @@ int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-    return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
-                    cft->private);
+    struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+    u64 val = 0;
+    int type, name;
+
+    type = MEMFILE_TYPE(cft->private);
+    name = MEMFILE_ATTR(cft->private);
+    switch (type) {
+    case _MEM:
+        val = res_counter_read_u64(&mem->res, name);
+        break;
+    case _MEMSWAP:
+        if (do_swap_account)
+            val = res_counter_read_u64(&mem->memsw, name);
+        break;
+    default:
+        BUG();
+        break;
+    }
+    return val;
 }
 /*
  * The user of this function is...
@@ -1227,15 +1456,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                 const char *buffer)
 {
     struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+    int type, name;
     unsigned long long val;
     int ret;
 
-    switch (cft->private) {
+    type = MEMFILE_TYPE(cft->private);
+    name = MEMFILE_ATTR(cft->private);
+    switch (name) {
     case RES_LIMIT:
         /* This function does all necessary parse...reuse it */
         ret = res_counter_memparse_write_strategy(buffer, &val);
-        if (!ret)
+        if (ret)
+            break;
+        if (type == _MEM)
             ret = mem_cgroup_resize_limit(memcg, val);
+        else
+            ret = mem_cgroup_resize_memsw_limit(memcg, val);
         break;
     default:
         ret = -EINVAL; /* should be BUG() ? */
@@ -1247,14 +1483,23 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
     struct mem_cgroup *mem;
+    int type, name;
 
     mem = mem_cgroup_from_cont(cont);
-    switch (event) {
+    type = MEMFILE_TYPE(event);
+    name = MEMFILE_ATTR(event);
+    switch (name) {
     case RES_MAX_USAGE:
-        res_counter_reset_max(&mem->res);
+        if (type == _MEM)
+            res_counter_reset_max(&mem->res);
+        else
+            res_counter_reset_max(&mem->memsw);
         break;
     case RES_FAILCNT:
-        res_counter_reset_failcnt(&mem->res);
+        if (type == _MEM)
+            res_counter_reset_failcnt(&mem->res);
+        else
+            res_counter_reset_failcnt(&mem->memsw);
         break;
     }
     return 0;
@@ -1315,24 +1560,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 static struct cftype mem_cgroup_files[] = {
     {
         .name = "usage_in_bytes",
-        .private = RES_USAGE,
+        .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
         .read_u64 = mem_cgroup_read,
     },
     {
         .name = "max_usage_in_bytes",
-        .private = RES_MAX_USAGE,
+        .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
         .trigger = mem_cgroup_reset,
         .read_u64 = mem_cgroup_read,
     },
     {
         .name = "limit_in_bytes",
-        .private = RES_LIMIT,
+        .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
         .write_string = mem_cgroup_write,
         .read_u64 = mem_cgroup_read,
     },
     {
         .name = "failcnt",
-        .private = RES_FAILCNT,
+        .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
         .trigger = mem_cgroup_reset,
         .read_u64 = mem_cgroup_read,
     },
@@ -1346,6 +1591,47 @@ static struct cftype mem_cgroup_files[] = {
     },
 };
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static struct cftype memsw_cgroup_files[] = {
+    {
+        .name = "memsw.usage_in_bytes",
+        .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+        .read_u64 = mem_cgroup_read,
+    },
+    {
+        .name = "memsw.max_usage_in_bytes",
+        .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+        .trigger = mem_cgroup_reset,
+        .read_u64 = mem_cgroup_read,
+    },
+    {
+        .name = "memsw.limit_in_bytes",
+        .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+        .write_string = mem_cgroup_write,
+        .read_u64 = mem_cgroup_read,
+    },
+    {
+        .name = "memsw.failcnt",
+        .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+        .trigger = mem_cgroup_reset,
+        .read_u64 = mem_cgroup_read,
+    },
+};
+
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+    if (!do_swap_account)
+        return 0;
+    return cgroup_add_files(cont, ss, memsw_cgroup_files,
+                ARRAY_SIZE(memsw_cgroup_files));
+}
+#else
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+    return 0;
+}
+#endif
+
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
     struct mem_cgroup_per_node *pn;
@@ -1404,14 +1690,44 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
     return mem;
 }
 
+/*
+ * At destroying mem_cgroup, references from swap_cgroup can remain.
+ * (scanning all at force_empty is too costly...)
+ *
+ * Instead of clearing all references at force_empty, we remember
+ * the number of references from swap_cgroup and free mem_cgroup when
+ * it goes down to 0.
+ *
+ * When mem_cgroup is destroyed, mem->obsolete will be set to 1 and
+ * any entry which points to this memcg will be ignored at swapin.
+ *
+ * Removal of the cgroup itself succeeds regardless of refs from swap.
+ */
+
 static void mem_cgroup_free(struct mem_cgroup *mem)
 {
+    if (atomic_read(&mem->refcnt) > 0)
+        return;
     if (mem_cgroup_size() < PAGE_SIZE)
         kfree(mem);
     else
         vfree(mem);
 }
 
+static void mem_cgroup_get(struct mem_cgroup *mem)
+{
+    atomic_inc(&mem->refcnt);
+}
+
+static void mem_cgroup_put(struct mem_cgroup *mem)
+{
+    if (atomic_dec_and_test(&mem->refcnt)) {
+        if (!mem->obsolete)
+            return;
+        mem_cgroup_free(mem);
+    }
+}
+
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 static void __init enable_swap_cgroup(void)
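A compact model of the deferred-free scheme the comment above describes, with plain ints standing in for the atomics: live swap records keep the structure alive, and the final put frees it only once the group has been marked obsolete:

#include <stdio.h>
#include <stdlib.h>

struct memcg {
    int refcnt;     /* references held by swap_cgroup records */
    int obsolete;   /* set at pre_destroy (rmdir) */
};

static void memcg_free(struct memcg *mem)
{
    if (mem->refcnt > 0)
        return;     /* swap records still point here */
    free(mem);
    printf("freed\n");
}

static void memcg_put(struct memcg *mem)
{
    if (--mem->refcnt == 0 && mem->obsolete)
        memcg_free(mem);
}

int main(void)
{
    struct memcg *mem = calloc(1, sizeof(*mem));

    mem->refcnt = 1;    /* one live swap record */
    mem->obsolete = 1;  /* the cgroup was rmdir'ed */
    memcg_free(mem);    /* skipped: the record still holds a ref */
    memcg_put(mem);     /* last record gone, now actually freed */
    return 0;
}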
@@ -1436,6 +1752,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
         return ERR_PTR(-ENOMEM);
 
     res_counter_init(&mem->res);
+    res_counter_init(&mem->memsw);
 
     for_each_node_state(node, N_POSSIBLE)
         if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -1456,6 +1773,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
                     struct cgroup *cont)
 {
     struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+    mem->obsolete = 1;
     mem_cgroup_force_empty(mem, false);
 }
 
@@ -1474,8 +1792,14 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
                    struct cgroup *cont)
 {
-    return cgroup_add_files(cont, ss, mem_cgroup_files,
-                ARRAY_SIZE(mem_cgroup_files));
+    int ret;
+
+    ret = cgroup_add_files(cont, ss, mem_cgroup_files,
+                ARRAY_SIZE(mem_cgroup_files));
+
+    if (!ret)
+        ret = register_memsw_files(cont, ss);
+    return ret;
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
diff --git a/mm/memory.c b/mm/memory.c
index ba5189e322e6..1358012ffa73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2431,7 +2431,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
     lock_page(page);
     delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-    if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) {
+    if (mem_cgroup_try_charge_swapin(mm, page,
+                GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) {
         ret = VM_FAULT_OOM;
         unlock_page(page);
         goto out;
@@ -2449,8 +2450,20 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         goto out_nomap;
     }
 
-    /* The page isn't present yet, go ahead with the fault. */
+    /*
+     * The page isn't present yet, go ahead with the fault.
+     *
+     * Be careful about the sequence of operations here.
+     * To get its accounting right, reuse_swap_page() must be called
+     * while the page is counted on swap but not yet in mapcount i.e.
+     * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+     * must be called after the swap_free(), or it will never succeed.
+     * And mem_cgroup_commit_charge_swapin(), which uses the swp_entry
+     * in page->private, must be called before reuse_swap_page(),
+     * which may delete_from_swap_cache().
+     */
 
+    mem_cgroup_commit_charge_swapin(page, ptr);
     inc_mm_counter(mm, anon_rss);
     pte = mk_pte(page, vma->vm_page_prot);
     if (write_access && reuse_swap_page(page)) {
@@ -2461,7 +2474,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
     flush_icache_page(vma, page);
     set_pte_at(mm, address, page_table, pte);
     page_add_anon_rmap(page, vma, address);
-    mem_cgroup_commit_charge_swapin(page, ptr);
 
     swap_free(entry);
     if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
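The do_swap_page() changes show the caller side of the swap-in charge protocol: try_charge_swapin() before taking the pte lock, commit once the pte is known good (and before reuse_swap_page() can delete the page from swap cache), cancel on any bailout. A schematic of those obligations; the bodies are stubs, only the ordering is the point:

#include <stdbool.h>
#include <stdio.h>

struct mem_cgroup { int unused; };  /* stand-in for the kernel type */
static struct mem_cgroup dummy;

/* stubs standing in for the three memcg calls this patch introduces */
static int try_charge_swapin(struct mem_cgroup **ptr)
{
    *ptr = &dummy;      /* charge reserved against some group */
    return 0;
}
static void commit_charge_swapin(struct mem_cgroup *ptr) { printf("commit\n"); }
static void cancel_charge_swapin(struct mem_cgroup *ptr) { printf("cancel\n"); }

/* shape of do_swap_page() after this patch: try, then commit or cancel */
static int fault_swap_page(bool pte_still_matches)
{
    struct mem_cgroup *ptr = NULL;

    if (try_charge_swapin(&ptr))        /* before the pte lock */
        return -1;                      /* VM_FAULT_OOM */

    if (!pte_still_matches) {           /* raced with another fault */
        cancel_charge_swapin(ptr);      /* give the charge back */
        return 0;
    }

    /* commit before reuse_swap_page() can delete_from_swap_cache() */
    commit_charge_swapin(ptr);
    return 0;
}

int main(void)
{
    fault_swap_page(true);
    fault_swap_page(false);
    return 0;
}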
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 09291ca11f5f..3ecea98ecb45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/pgtable.h>
 
@@ -108,6 +109,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  */
 void __delete_from_swap_cache(struct page *page)
 {
+    swp_entry_t ent = {.val = page_private(page)};
+
     VM_BUG_ON(!PageLocked(page));
     VM_BUG_ON(!PageSwapCache(page));
     VM_BUG_ON(PageWriteback(page));
@@ -118,7 +121,7 @@ void __delete_from_swap_cache(struct page *page)
     total_swapcache_pages--;
     __dec_zone_page_state(page, NR_FILE_PAGES);
     INC_CACHE_INFO(del_total);
-    mem_cgroup_uncharge_swapcache(page);
+    mem_cgroup_uncharge_swapcache(page, ent);
 }
 
 /**
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e7a715a3866..0579d9069b61 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -471,8 +471,9 @@ out:
     return NULL;
 }
 
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
 {
+    unsigned long offset = swp_offset(ent);
     int count = p->swap_map[offset];
 
     if (count < SWAP_MAP_MAX) {
@@ -487,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
                 swap_list.next = p - swap_info;
             nr_swap_pages++;
             p->inuse_pages--;
+            mem_cgroup_uncharge_swap(ent);
         }
     }
     return count;
@@ -502,7 +504,7 @@ void swap_free(swp_entry_t entry)
 
     p = swap_info_get(entry);
     if (p) {
-        swap_entry_free(p, swp_offset(entry));
+        swap_entry_free(p, entry);
         spin_unlock(&swap_lock);
     }
 }
@@ -582,7 +584,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
     p = swap_info_get(entry);
     if (p) {
-        if (swap_entry_free(p, swp_offset(entry)) == 1) {
+        if (swap_entry_free(p, entry) == 1) {
             page = find_get_page(&swapper_space, entry.val);
             if (page && !trylock_page(page)) {
                 page_cache_release(page);
@@ -696,7 +698,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
     pte_t *pte;
     int ret = 1;
 
-    if (mem_cgroup_try_charge(vma->vm_mm, GFP_HIGHUSER_MOVABLE, &ptr))
+    if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
+                    GFP_HIGHUSER_MOVABLE, &ptr))
         ret = -ENOMEM;
 
     pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b07c48b09a93..f63b20dd7714 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1661,7 +1661,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
-                       gfp_t gfp_mask)
+                       gfp_t gfp_mask,
+                       bool noswap)
 {
     struct scan_control sc = {
         .may_writepage = !laptop_mode,
@@ -1674,6 +1675,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
     };
     struct zonelist *zonelist;
 
+    if (noswap)
+        sc.may_swap = 0;
+
     sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
             (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
     zonelist = NODE_DATA(numa_node_id())->node_zonelists;