about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Balbir Singh <balbir@linux.vnet.ibm.com> 2008-02-07 03:13:56 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-07 11:42:18 -0500
commit: 66e1707bc34609f626e2e7b4fe7e454c9748bad5 (patch)
tree: d850a729887485874c976ba64eb85e3406e488a1
parent: 67e465a77ba658635309ee00b367bec6555ea544 (diff)
Memory controller: add per cgroup LRU and reclaim

Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has been
modified to make the isolate_lru_pages() as a pluggable component. The
scan_control data structure now accepts the cgroup on behalf of which
reclaims are carried out. try_to_free_pages() has been extended to become
cgroup aware.

[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

-rw-r--r--  include/linux/memcontrol.h   12
-rw-r--r--  include/linux/res_counter.h  23
-rw-r--r--  include/linux/swap.h          3
-rw-r--r--  mm/memcontrol.c             148
-rw-r--r--  mm/swap.c                     2
-rw-r--r--  mm/vmscan.c                 128
6 files changed, 286 insertions, 30 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f5b47efab48b..9c3c1c97c197 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -32,6 +32,13 @@ extern void page_assign_page_cgroup(struct page *page,
32extern struct page_cgroup *page_get_page_cgroup(struct page *page); 32extern struct page_cgroup *page_get_page_cgroup(struct page *page);
33extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm); 33extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm);
34extern void mem_cgroup_uncharge(struct page_cgroup *pc); 34extern void mem_cgroup_uncharge(struct page_cgroup *pc);
35extern void mem_cgroup_move_lists(struct page_cgroup *pc, bool active);
36extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
37 struct list_head *dst,
38 unsigned long *scanned, int order,
39 int mode, struct zone *z,
40 struct mem_cgroup *mem_cont,
41 int active);
35 42
36static inline void mem_cgroup_uncharge_page(struct page *page) 43static inline void mem_cgroup_uncharge_page(struct page *page)
37{ 44{
@@ -71,6 +78,11 @@ static inline void mem_cgroup_uncharge_page(struct page *page)
71{ 78{
72} 79}
73 80
81static inline void mem_cgroup_move_lists(struct page_cgroup *pc,
82 bool active)
83{
84}
85
74#endif /* CONFIG_CGROUP_MEM_CONT */ 86#endif /* CONFIG_CGROUP_MEM_CONT */
75 87
76#endif /* _LINUX_MEMCONTROL_H */ 88#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index eeb3f7749772..5e60a4f34243 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -99,4 +99,27 @@ int res_counter_charge(struct res_counter *counter, unsigned long val);
99void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); 99void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
100void res_counter_uncharge(struct res_counter *counter, unsigned long val); 100void res_counter_uncharge(struct res_counter *counter, unsigned long val);
101 101
102static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
103{
104 if (cnt->usage < cnt->limit)
105 return true;
106
107 return false;
108}
109
110/*
111 * Helper function to detect if the cgroup is within it's limit or
112 * not. It's currently called from cgroup_rss_prepare()
113 */
114static inline bool res_counter_check_under_limit(struct res_counter *cnt)
115{
116 bool ret;
117 unsigned long flags;
118
119 spin_lock_irqsave(&cnt->lock, flags);
120 ret = res_counter_limit_check_locked(cnt);
121 spin_unlock_irqrestore(&cnt->lock, flags);
122 return ret;
123}
124
102#endif 125#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 353153ea0bd5..4d91bc0e0fd5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -5,6 +5,7 @@
5#include <linux/linkage.h> 5#include <linux/linkage.h>
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/list.h> 7#include <linux/list.h>
8#include <linux/memcontrol.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9 10
10#include <asm/atomic.h> 11#include <asm/atomic.h>
@@ -182,6 +183,8 @@ extern void swap_setup(void);
182/* linux/mm/vmscan.c */ 183/* linux/mm/vmscan.c */
183extern unsigned long try_to_free_pages(struct zone **zones, int order, 184extern unsigned long try_to_free_pages(struct zone **zones, int order,
184 gfp_t gfp_mask); 185 gfp_t gfp_mask);
186extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem);
187extern int __isolate_lru_page(struct page *page, int mode);
185extern unsigned long shrink_all_memory(unsigned long nr_pages); 188extern unsigned long shrink_all_memory(unsigned long nr_pages);
186extern int vm_swappiness; 189extern int vm_swappiness;
187extern int remove_mapping(struct address_space *mapping, struct page *page); 190extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b25df2a9d024..9e9ff914c0f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -22,10 +22,15 @@
22#include <linux/cgroup.h> 22#include <linux/cgroup.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/page-flags.h> 24#include <linux/page-flags.h>
25#include <linux/backing-dev.h>
25#include <linux/bit_spinlock.h> 26#include <linux/bit_spinlock.h>
26#include <linux/rcupdate.h> 27#include <linux/rcupdate.h>
28#include <linux/swap.h>
29#include <linux/spinlock.h>
30#include <linux/fs.h>
27 31
28struct cgroup_subsys mem_cgroup_subsys; 32struct cgroup_subsys mem_cgroup_subsys;
33static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
29 34
30/* 35/*
31 * The memory controller data structure. The memory controller controls both 36 * The memory controller data structure. The memory controller controls both
@@ -51,6 +56,10 @@ struct mem_cgroup {
51 */ 56 */
52 struct list_head active_list; 57 struct list_head active_list;
53 struct list_head inactive_list; 58 struct list_head inactive_list;
59 /*
60 * spin_lock to protect the per cgroup LRU
61 */
62 spinlock_t lru_lock;
54}; 63};
55 64
56/* 65/*
@@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page)
141 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); 150 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
142} 151}
143 152
153void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
154{
155 if (active)
156 list_move(&pc->lru, &pc->mem_cgroup->active_list);
157 else
158 list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
159}
160
161/*
162 * This routine assumes that the appropriate zone's lru lock is already held
163 */
164void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
165{
166 struct mem_cgroup *mem;
167 if (!pc)
168 return;
169
170 mem = pc->mem_cgroup;
171
172 spin_lock(&mem->lru_lock);
173 __mem_cgroup_move_lists(pc, active);
174 spin_unlock(&mem->lru_lock);
175}
176
177unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
178 struct list_head *dst,
179 unsigned long *scanned, int order,
180 int mode, struct zone *z,
181 struct mem_cgroup *mem_cont,
182 int active)
183{
184 unsigned long nr_taken = 0;
185 struct page *page;
186 unsigned long scan;
187 LIST_HEAD(pc_list);
188 struct list_head *src;
189 struct page_cgroup *pc;
190
191 if (active)
192 src = &mem_cont->active_list;
193 else
194 src = &mem_cont->inactive_list;
195
196 spin_lock(&mem_cont->lru_lock);
197 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
198 pc = list_entry(src->prev, struct page_cgroup, lru);
199 page = pc->page;
200 VM_BUG_ON(!pc);
201
202 if (PageActive(page) && !active) {
203 __mem_cgroup_move_lists(pc, true);
204 scan--;
205 continue;
206 }
207 if (!PageActive(page) && active) {
208 __mem_cgroup_move_lists(pc, false);
209 scan--;
210 continue;
211 }
212
213 /*
214 * Reclaim, per zone
215 * TODO: make the active/inactive lists per zone
216 */
217 if (page_zone(page) != z)
218 continue;
219
220 /*
221 * Check if the meta page went away from under us
222 */
223 if (!list_empty(&pc->lru))
224 list_move(&pc->lru, &pc_list);
225 else
226 continue;
227
228 if (__isolate_lru_page(page, mode) == 0) {
229 list_move(&page->lru, dst);
230 nr_taken++;
231 }
232 }
233
234 list_splice(&pc_list, src);
235 spin_unlock(&mem_cont->lru_lock);
236
237 *scanned = scan;
238 return nr_taken;
239}
240
144/* 241/*
145 * Charge the memory controller for page usage. 242 * Charge the memory controller for page usage.
146 * Return 243 * Return
@@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
151{ 248{
152 struct mem_cgroup *mem; 249 struct mem_cgroup *mem;
153 struct page_cgroup *pc, *race_pc; 250 struct page_cgroup *pc, *race_pc;
251 unsigned long flags;
252 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
154 253
155 /* 254 /*
156 * Should page_cgroup's go to their own slab? 255 * Should page_cgroup's go to their own slab?
@@ -159,14 +258,20 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
159 * to see if the cgroup page already has a page_cgroup associated 258 * to see if the cgroup page already has a page_cgroup associated
160 * with it 259 * with it
161 */ 260 */
261retry:
162 lock_page_cgroup(page); 262 lock_page_cgroup(page);
163 pc = page_get_page_cgroup(page); 263 pc = page_get_page_cgroup(page);
164 /* 264 /*
165 * The page_cgroup exists and the page has already been accounted 265 * The page_cgroup exists and the page has already been accounted
166 */ 266 */
167 if (pc) { 267 if (pc) {
168 atomic_inc(&pc->ref_cnt); 268 if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
169 goto done; 269 /* this page is under being uncharged ? */
270 unlock_page_cgroup(page);
271 cpu_relax();
272 goto retry;
273 } else
274 goto done;
170 } 275 }
171 276
172 unlock_page_cgroup(page); 277 unlock_page_cgroup(page);
@@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
197 * If we created the page_cgroup, we should free it on exceeding 302 * If we created the page_cgroup, we should free it on exceeding
198 * the cgroup limit. 303 * the cgroup limit.
199 */ 304 */
200 if (res_counter_charge(&mem->res, 1)) { 305 while (res_counter_charge(&mem->res, 1)) {
306 if (try_to_free_mem_cgroup_pages(mem))
307 continue;
308
309 /*
310 * try_to_free_mem_cgroup_pages() might not give us a full
311 * picture of reclaim. Some pages are reclaimed and might be
312 * moved to swap cache or just unmapped from the cgroup.
313 * Check the limit again to see if the reclaim reduced the
314 * current usage of the cgroup before giving up
315 */
316 if (res_counter_check_under_limit(&mem->res))
317 continue;
318 /*
319 * Since we control both RSS and cache, we end up with a
320 * very interesting scenario where we end up reclaiming
321 * memory (essentially RSS), since the memory is pushed
322 * to swap cache, we eventually end up adding those
323 * pages back to our list. Hence we give ourselves a
324 * few chances before we fail
325 */
326 else if (nr_retries--) {
327 congestion_wait(WRITE, HZ/10);
328 continue;
329 }
330
201 css_put(&mem->css); 331 css_put(&mem->css);
202 goto free_pc; 332 goto free_pc;
203 } 333 }
@@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
221 pc->page = page; 351 pc->page = page;
222 page_assign_page_cgroup(page, pc); 352 page_assign_page_cgroup(page, pc);
223 353
354 spin_lock_irqsave(&mem->lru_lock, flags);
355 list_add(&pc->lru, &mem->active_list);
356 spin_unlock_irqrestore(&mem->lru_lock, flags);
357
224done: 358done:
225 unlock_page_cgroup(page); 359 unlock_page_cgroup(page);
226 return 0; 360 return 0;
227free_pc: 361free_pc:
228 kfree(pc); 362 kfree(pc);
229 return -ENOMEM;
230err: 363err:
231 unlock_page_cgroup(page);
232 return -ENOMEM; 364 return -ENOMEM;
233} 365}
234 366
@@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
240{ 372{
241 struct mem_cgroup *mem; 373 struct mem_cgroup *mem;
242 struct page *page; 374 struct page *page;
375 unsigned long flags;
243 376
244 if (!pc) 377 if (!pc)
245 return; 378 return;
@@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
252 page_assign_page_cgroup(page, NULL); 385 page_assign_page_cgroup(page, NULL);
253 unlock_page_cgroup(page); 386 unlock_page_cgroup(page);
254 res_counter_uncharge(&mem->res, 1); 387 res_counter_uncharge(&mem->res, 1);
388
389 spin_lock_irqsave(&mem->lru_lock, flags);
390 list_del_init(&pc->lru);
391 spin_unlock_irqrestore(&mem->lru_lock, flags);
255 kfree(pc); 392 kfree(pc);
256 } 393 }
257} 394}
@@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
310 res_counter_init(&mem->res); 447 res_counter_init(&mem->res);
311 INIT_LIST_HEAD(&mem->active_list); 448 INIT_LIST_HEAD(&mem->active_list);
312 INIT_LIST_HEAD(&mem->inactive_list); 449 INIT_LIST_HEAD(&mem->inactive_list);
450 spin_lock_init(&mem->lru_lock);
313 return &mem->css; 451 return &mem->css;
314} 452}
315 453
diff --git a/mm/swap.c b/mm/swap.c
index 57b7e25a939c..710a20bb9749 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,6 +29,7 @@
29#include <linux/cpu.h> 29#include <linux/cpu.h>
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h>
32 33
33/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
34int page_cluster; 35int page_cluster;
@@ -175,6 +176,7 @@ void activate_page(struct page *page)
175 SetPageActive(page); 176 SetPageActive(page);
176 add_page_to_active_list(zone, page); 177 add_page_to_active_list(zone, page);
177 __count_vm_event(PGACTIVATE); 178 __count_vm_event(PGACTIVATE);
179 mem_cgroup_move_lists(page_get_page_cgroup(page), true);
178 } 180 }
179 spin_unlock_irq(&zone->lru_lock); 181 spin_unlock_irq(&zone->lru_lock);
180} 182}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5a9597e3bbc..7408a8a7d882 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42#include <asm/div64.h> 43#include <asm/div64.h>
@@ -68,6 +69,15 @@ struct scan_control {
68 int all_unreclaimable; 69 int all_unreclaimable;
69 70
70 int order; 71 int order;
72
73 /* Which cgroup do we reclaim from */
74 struct mem_cgroup *mem_cgroup;
75
76 /* Pluggable isolate pages callback */
77 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
78 unsigned long *scanned, int order, int mode,
79 struct zone *z, struct mem_cgroup *mem_cont,
80 int active);
71}; 81};
72 82
73#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 83#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -626,7 +636,7 @@ keep:
626 * 636 *
627 * returns 0 on success, -ve errno on failure. 637 * returns 0 on success, -ve errno on failure.
628 */ 638 */
629static int __isolate_lru_page(struct page *page, int mode) 639int __isolate_lru_page(struct page *page, int mode)
630{ 640{
631 int ret = -EINVAL; 641 int ret = -EINVAL;
632 642
@@ -760,6 +770,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
760 return nr_taken; 770 return nr_taken;
761} 771}
762 772
773static unsigned long isolate_pages_global(unsigned long nr,
774 struct list_head *dst,
775 unsigned long *scanned, int order,
776 int mode, struct zone *z,
777 struct mem_cgroup *mem_cont,
778 int active)
779{
780 if (active)
781 return isolate_lru_pages(nr, &z->active_list, dst,
782 scanned, order, mode);
783 else
784 return isolate_lru_pages(nr, &z->inactive_list, dst,
785 scanned, order, mode);
786}
787
763/* 788/*
764 * clear_active_flags() is a helper for shrink_active_list(), clearing 789 * clear_active_flags() is a helper for shrink_active_list(), clearing
765 * any active bits from the pages in the list. 790 * any active bits from the pages in the list.
@@ -801,11 +826,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
801 unsigned long nr_freed; 826 unsigned long nr_freed;
802 unsigned long nr_active; 827 unsigned long nr_active;
803 828
804 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 829 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
805 &zone->inactive_list,
806 &page_list, &nr_scan, sc->order, 830 &page_list, &nr_scan, sc->order,
807 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 831 (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
808 ISOLATE_BOTH : ISOLATE_INACTIVE); 832 ISOLATE_BOTH : ISOLATE_INACTIVE,
833 zone, sc->mem_cgroup, 0);
809 nr_active = clear_active_flags(&page_list); 834 nr_active = clear_active_flags(&page_list);
810 __count_vm_events(PGDEACTIVATE, nr_active); 835 __count_vm_events(PGDEACTIVATE, nr_active);
811 836
@@ -1018,8 +1043,9 @@ force_reclaim_mapped:
1018 1043
1019 lru_add_drain(); 1044 lru_add_drain();
1020 spin_lock_irq(&zone->lru_lock); 1045 spin_lock_irq(&zone->lru_lock);
1021 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 1046 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1022 &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); 1047 ISOLATE_ACTIVE, zone,
1048 sc->mem_cgroup, 1);
1023 zone->pages_scanned += pgscanned; 1049 zone->pages_scanned += pgscanned;
1024 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1050 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
1025 spin_unlock_irq(&zone->lru_lock); 1051 spin_unlock_irq(&zone->lru_lock);
@@ -1051,6 +1077,7 @@ force_reclaim_mapped:
1051 ClearPageActive(page); 1077 ClearPageActive(page);
1052 1078
1053 list_move(&page->lru, &zone->inactive_list); 1079 list_move(&page->lru, &zone->inactive_list);
1080 mem_cgroup_move_lists(page_get_page_cgroup(page), false);
1054 pgmoved++; 1081 pgmoved++;
1055 if (!pagevec_add(&pvec, page)) { 1082 if (!pagevec_add(&pvec, page)) {
1056 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1083 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1106,7 @@ force_reclaim_mapped:
1079 SetPageLRU(page); 1106 SetPageLRU(page);
1080 VM_BUG_ON(!PageActive(page)); 1107 VM_BUG_ON(!PageActive(page));
1081 list_move(&page->lru, &zone->active_list); 1108 list_move(&page->lru, &zone->active_list);
1109 mem_cgroup_move_lists(page_get_page_cgroup(page), true);
1082 pgmoved++; 1110 pgmoved++;
1083 if (!pagevec_add(&pvec, page)) { 1111 if (!pagevec_add(&pvec, page)) {
1084 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1112 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1206,7 +1234,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1206 * holds filesystem locks which prevent writeout this might not work, and the 1234 * holds filesystem locks which prevent writeout this might not work, and the
1207 * allocation attempt will fail. 1235 * allocation attempt will fail.
1208 */ 1236 */
1209unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) 1237static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1238 struct scan_control *sc)
1210{ 1239{
1211 int priority; 1240 int priority;
1212 int ret = 0; 1241 int ret = 0;
@@ -1215,14 +1244,6 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1215 struct reclaim_state *reclaim_state = current->reclaim_state; 1244 struct reclaim_state *reclaim_state = current->reclaim_state;
1216 unsigned long lru_pages = 0; 1245 unsigned long lru_pages = 0;
1217 int i; 1246 int i;
1218 struct scan_control sc = {
1219 .gfp_mask = gfp_mask,
1220 .may_writepage = !laptop_mode,
1221 .swap_cluster_max = SWAP_CLUSTER_MAX,
1222 .may_swap = 1,
1223 .swappiness = vm_swappiness,
1224 .order = order,
1225 };
1226 1247
1227 count_vm_event(ALLOCSTALL); 1248 count_vm_event(ALLOCSTALL);
1228 1249
@@ -1237,17 +1258,22 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1237 } 1258 }
1238 1259
1239 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1260 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1240 sc.nr_scanned = 0; 1261 sc->nr_scanned = 0;
1241 if (!priority) 1262 if (!priority)
1242 disable_swap_token(); 1263 disable_swap_token();
1243 nr_reclaimed += shrink_zones(priority, zones, &sc); 1264 nr_reclaimed += shrink_zones(priority, zones, sc);
1244 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 1265 /*
1266 * Don't shrink slabs when reclaiming memory from
1267 * over limit cgroups
1268 */
1269 if (sc->mem_cgroup == NULL)
1270 shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
1245 if (reclaim_state) { 1271 if (reclaim_state) {
1246 nr_reclaimed += reclaim_state->reclaimed_slab; 1272 nr_reclaimed += reclaim_state->reclaimed_slab;
1247 reclaim_state->reclaimed_slab = 0; 1273 reclaim_state->reclaimed_slab = 0;
1248 } 1274 }
1249 total_scanned += sc.nr_scanned; 1275 total_scanned += sc->nr_scanned;
1250 if (nr_reclaimed >= sc.swap_cluster_max) { 1276 if (nr_reclaimed >= sc->swap_cluster_max) {
1251 ret = 1; 1277 ret = 1;
1252 goto out; 1278 goto out;
1253 } 1279 }
@@ -1259,18 +1285,18 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1259 * that's undesirable in laptop mode, where we *want* lumpy 1285 * that's undesirable in laptop mode, where we *want* lumpy
1260 * writeout. So in laptop mode, write out the whole world. 1286 * writeout. So in laptop mode, write out the whole world.
1261 */ 1287 */
1262 if (total_scanned > sc.swap_cluster_max + 1288 if (total_scanned > sc->swap_cluster_max +
1263 sc.swap_cluster_max / 2) { 1289 sc->swap_cluster_max / 2) {
1264 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1290 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1265 sc.may_writepage = 1; 1291 sc->may_writepage = 1;
1266 } 1292 }
1267 1293
1268 /* Take a nap, wait for some writeback to complete */ 1294 /* Take a nap, wait for some writeback to complete */
1269 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1295 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1270 congestion_wait(WRITE, HZ/10); 1296 congestion_wait(WRITE, HZ/10);
1271 } 1297 }
1272 /* top priority shrink_caches still had more to do? don't OOM, then */ 1298 /* top priority shrink_caches still had more to do? don't OOM, then */
1273 if (!sc.all_unreclaimable) 1299 if (!sc->all_unreclaimable && sc->mem_cgroup == NULL)
1274 ret = 1; 1300 ret = 1;
1275out: 1301out:
1276 /* 1302 /*
@@ -1293,6 +1319,54 @@ out:
1293 return ret; 1319 return ret;
1294} 1320}
1295 1321
1322unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1323{
1324 struct scan_control sc = {
1325 .gfp_mask = gfp_mask,
1326 .may_writepage = !laptop_mode,
1327 .swap_cluster_max = SWAP_CLUSTER_MAX,
1328 .may_swap = 1,
1329 .swappiness = vm_swappiness,
1330 .order = order,
1331 .mem_cgroup = NULL,
1332 .isolate_pages = isolate_pages_global,
1333 };
1334
1335 return do_try_to_free_pages(zones, gfp_mask, &sc);
1336}
1337
1338#ifdef CONFIG_CGROUP_MEM_CONT
1339
1340#ifdef CONFIG_HIGHMEM
1341#define ZONE_USERPAGES ZONE_HIGHMEM
1342#else
1343#define ZONE_USERPAGES ZONE_NORMAL
1344#endif
1345
1346unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
1347{
1348 struct scan_control sc = {
1349 .gfp_mask = GFP_KERNEL,
1350 .may_writepage = !laptop_mode,
1351 .may_swap = 1,
1352 .swap_cluster_max = SWAP_CLUSTER_MAX,
1353 .swappiness = vm_swappiness,
1354 .order = 0,
1355 .mem_cgroup = mem_cont,
1356 .isolate_pages = mem_cgroup_isolate_pages,
1357 };
1358 int node;
1359 struct zone **zones;
1360
1361 for_each_online_node(node) {
1362 zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;
1363 if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
1364 return 1;
1365 }
1366 return 0;
1367}
1368#endif
1369
1296/* 1370/*
1297 * For kswapd, balance_pgdat() will work across all this node's zones until 1371 * For kswapd, balance_pgdat() will work across all this node's zones until
1298 * they are all at pages_high. 1372 * they are all at pages_high.
@@ -1328,6 +1402,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1328 .swap_cluster_max = SWAP_CLUSTER_MAX, 1402 .swap_cluster_max = SWAP_CLUSTER_MAX,
1329 .swappiness = vm_swappiness, 1403 .swappiness = vm_swappiness,
1330 .order = order, 1404 .order = order,
1405 .mem_cgroup = NULL,
1406 .isolate_pages = isolate_pages_global,
1331 }; 1407 };
1332 /* 1408 /*
1333 * temp_priority is used to remember the scanning priority at which 1409 * temp_priority is used to remember the scanning priority at which
@@ -1649,6 +1725,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1649 .swap_cluster_max = nr_pages, 1725 .swap_cluster_max = nr_pages,
1650 .may_writepage = 1, 1726 .may_writepage = 1,
1651 .swappiness = vm_swappiness, 1727 .swappiness = vm_swappiness,
1728 .isolate_pages = isolate_pages_global,
1652 }; 1729 };
1653 1730
1654 current->reclaim_state = &reclaim_state; 1731 current->reclaim_state = &reclaim_state;
@@ -1834,6 +1911,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1834 SWAP_CLUSTER_MAX), 1911 SWAP_CLUSTER_MAX),
1835 .gfp_mask = gfp_mask, 1912 .gfp_mask = gfp_mask,
1836 .swappiness = vm_swappiness, 1913 .swappiness = vm_swappiness,
1914 .isolate_pages = isolate_pages_global,
1837 }; 1915 };
1838 unsigned long slab_reclaimable; 1916 unsigned long slab_reclaimable;
1839 1917