commit     66e1707bc34609f626e2e7b4fe7e454c9748bad5
tree       d850a729887485874c976ba64eb85e3406e488a1
parent     67e465a77ba658635309ee00b367bec6555ea544
author     Balbir Singh <balbir@linux.vnet.ibm.com>            2008-02-07 03:13:56 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 11:42:18 -0500

Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per-cgroup LRU. The reclaim algorithm has
been modified to make isolate_lru_pages() a pluggable component. The
scan_control data structure now carries the cgroup on whose behalf
reclaim is carried out. try_to_free_pages() has been extended to
become cgroup aware.
[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
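
The pluggable-isolate idea described above can be illustrated with a small
userspace sketch. This is not the kernel code: the types below are toy
stand-ins and the callback signature is heavily simplified, but it shows how
a scan_control that carries a mem_cgroup pointer and an isolate_pages
callback lets one reclaim core serve both the global LRU
(try_to_free_pages() style) and a per-cgroup LRU
(try_to_free_mem_cgroup_pages() style).

#include <stdio.h>

/* Toy stand-ins for kernel types; not the real definitions. */
struct mem_cgroup { const char *name; };
struct zone { const char *name; };

/*
 * Simplified scan_control: the reclaim core no longer calls a fixed
 * isolate_lru_pages(); it calls whatever callback was plugged in,
 * passing along the cgroup (NULL for global reclaim).
 */
struct scan_control {
	struct mem_cgroup *mem_cgroup;
	unsigned long (*isolate_pages)(unsigned long nr, struct zone *z,
				       struct mem_cgroup *mem, int active);
};

/* Global reclaim path: would scan the zone's own LRU lists. */
static unsigned long isolate_pages_global(unsigned long nr, struct zone *z,
					  struct mem_cgroup *mem, int active)
{
	(void)mem;
	printf("global: isolate up to %lu %s pages from zone %s\n",
	       nr, active ? "active" : "inactive", z->name);
	return nr;
}

/* Cgroup reclaim path: would scan the cgroup's private LRU lists instead. */
static unsigned long mem_cgroup_isolate(unsigned long nr, struct zone *z,
					struct mem_cgroup *mem, int active)
{
	printf("memcg %s: isolate up to %lu %s pages (zone %s)\n",
	       mem->name, nr, active ? "active" : "inactive", z->name);
	return nr;
}

/* The shared reclaim core only ever goes through the callback. */
static unsigned long shrink_zone(struct zone *z, struct scan_control *sc)
{
	return sc->isolate_pages(32, z, sc->mem_cgroup, 0);
}

int main(void)
{
	struct zone normal = { "Normal" };
	struct mem_cgroup memcg = { "grp_a" };

	struct scan_control global_sc = {
		.mem_cgroup = NULL,
		.isolate_pages = isolate_pages_global,
	};
	struct scan_control memcg_sc = {
		.mem_cgroup = &memcg,
		.isolate_pages = mem_cgroup_isolate,
	};

	shrink_zone(&normal, &global_sc);	/* global reclaim */
	shrink_zone(&normal, &memcg_sc);	/* per-cgroup reclaim */
	return 0;
}

The real signatures, and the actual isolate_pages_global() and
mem_cgroup_isolate_pages() implementations, are in the mm/vmscan.c and
mm/memcontrol.c hunks below.
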
-rw-r--r--  include/linux/memcontrol.h  |  12
-rw-r--r--  include/linux/res_counter.h |  23
-rw-r--r--  include/linux/swap.h        |   3
-rw-r--r--  mm/memcontrol.c             | 148
-rw-r--r--  mm/swap.c                   |   2
-rw-r--r--  mm/vmscan.c                 | 128
6 files changed, 286 insertions, 30 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f5b47efab48b..9c3c1c97c197 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -32,6 +32,13 @@ extern void page_assign_page_cgroup(struct page *page,
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm);
 extern void mem_cgroup_uncharge(struct page_cgroup *pc);
+extern void mem_cgroup_move_lists(struct page_cgroup *pc, bool active);
+extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+                                        struct list_head *dst,
+                                        unsigned long *scanned, int order,
+                                        int mode, struct zone *z,
+                                        struct mem_cgroup *mem_cont,
+                                        int active);
 
 static inline void mem_cgroup_uncharge_page(struct page *page)
 {
@@ -71,6 +78,11 @@ static inline void mem_cgroup_uncharge_page(struct page *page)
 {
 }
 
+static inline void mem_cgroup_move_lists(struct page_cgroup *pc,
+                                                bool active)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index eeb3f7749772..5e60a4f34243 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -99,4 +99,27 @@ int res_counter_charge(struct res_counter *counter, unsigned long val);
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
 void res_counter_uncharge(struct res_counter *counter, unsigned long val);
 
+static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
+{
+        if (cnt->usage < cnt->limit)
+                return true;
+
+        return false;
+}
+
+/*
+ * Helper function to detect if the cgroup is within it's limit or
+ * not. It's currently called from cgroup_rss_prepare()
+ */
+static inline bool res_counter_check_under_limit(struct res_counter *cnt)
+{
+        bool ret;
+        unsigned long flags;
+
+        spin_lock_irqsave(&cnt->lock, flags);
+        ret = res_counter_limit_check_locked(cnt);
+        spin_unlock_irqrestore(&cnt->lock, flags);
+        return ret;
+}
+
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 353153ea0bd5..4d91bc0e0fd5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -5,6 +5,7 @@
 #include <linux/linkage.h>
 #include <linux/mmzone.h>
 #include <linux/list.h>
+#include <linux/memcontrol.h>
 #include <linux/sched.h>
 
 #include <asm/atomic.h>
@@ -182,6 +183,8 @@ extern void swap_setup(void);
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zone **zones, int order,
                                         gfp_t gfp_mask);
+extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem);
+extern int __isolate_lru_page(struct page *page, int mode);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b25df2a9d024..9e9ff914c0f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -22,10 +22,15 @@
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
+#include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
+static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
 
 /*
  * The memory controller data structure. The memory controller controls both
@@ -51,6 +56,10 @@ struct mem_cgroup {
          */
         struct list_head active_list;
         struct list_head inactive_list;
+        /*
+         * spin_lock to protect the per cgroup LRU
+         */
+        spinlock_t lru_lock;
 };
 
 /*
@@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page)
         bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
+void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+        if (active)
+                list_move(&pc->lru, &pc->mem_cgroup->active_list);
+        else
+                list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
+}
+
+/*
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+        struct mem_cgroup *mem;
+        if (!pc)
+                return;
+
+        mem = pc->mem_cgroup;
+
+        spin_lock(&mem->lru_lock);
+        __mem_cgroup_move_lists(pc, active);
+        spin_unlock(&mem->lru_lock);
+}
+
+unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+                                        struct list_head *dst,
+                                        unsigned long *scanned, int order,
+                                        int mode, struct zone *z,
+                                        struct mem_cgroup *mem_cont,
+                                        int active)
+{
+        unsigned long nr_taken = 0;
+        struct page *page;
+        unsigned long scan;
+        LIST_HEAD(pc_list);
+        struct list_head *src;
+        struct page_cgroup *pc;
+
+        if (active)
+                src = &mem_cont->active_list;
+        else
+                src = &mem_cont->inactive_list;
+
+        spin_lock(&mem_cont->lru_lock);
+        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+                pc = list_entry(src->prev, struct page_cgroup, lru);
+                page = pc->page;
+                VM_BUG_ON(!pc);
+
+                if (PageActive(page) && !active) {
+                        __mem_cgroup_move_lists(pc, true);
+                        scan--;
+                        continue;
+                }
+                if (!PageActive(page) && active) {
+                        __mem_cgroup_move_lists(pc, false);
+                        scan--;
+                        continue;
+                }
+
+                /*
+                 * Reclaim, per zone
+                 * TODO: make the active/inactive lists per zone
+                 */
+                if (page_zone(page) != z)
+                        continue;
+
+                /*
+                 * Check if the meta page went away from under us
+                 */
+                if (!list_empty(&pc->lru))
+                        list_move(&pc->lru, &pc_list);
+                else
+                        continue;
+
+                if (__isolate_lru_page(page, mode) == 0) {
+                        list_move(&page->lru, dst);
+                        nr_taken++;
+                }
+        }
+
+        list_splice(&pc_list, src);
+        spin_unlock(&mem_cont->lru_lock);
+
+        *scanned = scan;
+        return nr_taken;
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 {
         struct mem_cgroup *mem;
         struct page_cgroup *pc, *race_pc;
+        unsigned long flags;
+        unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 
         /*
          * Should page_cgroup's go to their own slab?
@@ -159,14 +258,20 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
          * to see if the cgroup page already has a page_cgroup associated
          * with it
          */
+retry:
         lock_page_cgroup(page);
         pc = page_get_page_cgroup(page);
         /*
          * The page_cgroup exists and the page has already been accounted
          */
         if (pc) {
-                atomic_inc(&pc->ref_cnt);
-                goto done;
+                if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
+                        /* this page is under being uncharged ? */
+                        unlock_page_cgroup(page);
+                        cpu_relax();
+                        goto retry;
+                } else
+                        goto done;
         }
 
         unlock_page_cgroup(page);
@@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
          * If we created the page_cgroup, we should free it on exceeding
          * the cgroup limit.
          */
-        if (res_counter_charge(&mem->res, 1)) {
+        while (res_counter_charge(&mem->res, 1)) {
+                if (try_to_free_mem_cgroup_pages(mem))
+                        continue;
+
+                /*
+                 * try_to_free_mem_cgroup_pages() might not give us a full
+                 * picture of reclaim. Some pages are reclaimed and might be
+                 * moved to swap cache or just unmapped from the cgroup.
+                 * Check the limit again to see if the reclaim reduced the
+                 * current usage of the cgroup before giving up
+                 */
+                if (res_counter_check_under_limit(&mem->res))
+                        continue;
+                /*
+                 * Since we control both RSS and cache, we end up with a
+                 * very interesting scenario where we end up reclaiming
+                 * memory (essentially RSS), since the memory is pushed
+                 * to swap cache, we eventually end up adding those
+                 * pages back to our list. Hence we give ourselves a
+                 * few chances before we fail
+                 */
+                else if (nr_retries--) {
+                        congestion_wait(WRITE, HZ/10);
+                        continue;
+                }
+
                 css_put(&mem->css);
                 goto free_pc;
         }
@@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
         pc->page = page;
         page_assign_page_cgroup(page, pc);
 
+        spin_lock_irqsave(&mem->lru_lock, flags);
+        list_add(&pc->lru, &mem->active_list);
+        spin_unlock_irqrestore(&mem->lru_lock, flags);
+
 done:
         unlock_page_cgroup(page);
         return 0;
 free_pc:
         kfree(pc);
-        return -ENOMEM;
 err:
-        unlock_page_cgroup(page);
         return -ENOMEM;
 }
 
@@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 {
         struct mem_cgroup *mem;
         struct page *page;
+        unsigned long flags;
 
         if (!pc)
                 return;
@@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
                 page_assign_page_cgroup(page, NULL);
                 unlock_page_cgroup(page);
                 res_counter_uncharge(&mem->res, 1);
+
+                spin_lock_irqsave(&mem->lru_lock, flags);
+                list_del_init(&pc->lru);
+                spin_unlock_irqrestore(&mem->lru_lock, flags);
                 kfree(pc);
         }
 }
@@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
         res_counter_init(&mem->res);
         INIT_LIST_HEAD(&mem->active_list);
         INIT_LIST_HEAD(&mem->inactive_list);
+        spin_lock_init(&mem->lru_lock);
         return &mem->css;
 }
 
diff --git a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,6 +29,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/backing-dev.h>
+#include <linux/memcontrol.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -175,6 +176,7 @@ void activate_page(struct page *page)
                 SetPageActive(page);
                 add_page_to_active_list(zone, page);
                 __count_vm_event(PGACTIVATE);
+                mem_cgroup_move_lists(page_get_page_cgroup(page), true);
         }
         spin_unlock_irq(&zone->lru_lock);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5a9597e3bbc..7408a8a7d882 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -68,6 +69,15 @@ struct scan_control {
         int all_unreclaimable;
 
         int order;
+
+        /* Which cgroup do we reclaim from */
+        struct mem_cgroup *mem_cgroup;
+
+        /* Pluggable isolate pages callback */
+        unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+                        unsigned long *scanned, int order, int mode,
+                        struct zone *z, struct mem_cgroup *mem_cont,
+                        int active);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -626,7 +636,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-static int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode)
 {
         int ret = -EINVAL;
 
@@ -760,6 +770,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         return nr_taken;
 }
 
+static unsigned long isolate_pages_global(unsigned long nr,
+                                        struct list_head *dst,
+                                        unsigned long *scanned, int order,
+                                        int mode, struct zone *z,
+                                        struct mem_cgroup *mem_cont,
+                                        int active)
+{
+        if (active)
+                return isolate_lru_pages(nr, &z->active_list, dst,
+                                                scanned, order, mode);
+        else
+                return isolate_lru_pages(nr, &z->inactive_list, dst,
+                                                scanned, order, mode);
+}
+
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
@@ -801,11 +826,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 unsigned long nr_freed;
                 unsigned long nr_active;
 
-                nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-                             &zone->inactive_list,
+                nr_taken = sc->isolate_pages(sc->swap_cluster_max,
                              &page_list, &nr_scan, sc->order,
                              (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-                                             ISOLATE_BOTH : ISOLATE_INACTIVE);
+                                             ISOLATE_BOTH : ISOLATE_INACTIVE,
+                                zone, sc->mem_cgroup, 0);
                 nr_active = clear_active_flags(&page_list);
                 __count_vm_events(PGDEACTIVATE, nr_active);
 
@@ -1018,8 +1043,9 @@ force_reclaim_mapped:
 
         lru_add_drain();
         spin_lock_irq(&zone->lru_lock);
-        pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-                            &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
+        pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+                                        ISOLATE_ACTIVE, zone,
+                                        sc->mem_cgroup, 1);
         zone->pages_scanned += pgscanned;
         __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
         spin_unlock_irq(&zone->lru_lock);
@@ -1051,6 +1077,7 @@ force_reclaim_mapped:
                 ClearPageActive(page);
 
                 list_move(&page->lru, &zone->inactive_list);
+                mem_cgroup_move_lists(page_get_page_cgroup(page), false);
                 pgmoved++;
                 if (!pagevec_add(&pvec, page)) {
                         __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1106,7 @@ force_reclaim_mapped:
                 SetPageLRU(page);
                 VM_BUG_ON(!PageActive(page));
                 list_move(&page->lru, &zone->active_list);
+                mem_cgroup_move_lists(page_get_page_cgroup(page), true);
                 pgmoved++;
                 if (!pagevec_add(&pvec, page)) {
                         __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1206,7 +1234,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
+                                                struct scan_control *sc)
 {
         int priority;
         int ret = 0;
@@ -1215,14 +1244,6 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
         struct reclaim_state *reclaim_state = current->reclaim_state;
         unsigned long lru_pages = 0;
         int i;
-        struct scan_control sc = {
-                .gfp_mask = gfp_mask,
-                .may_writepage = !laptop_mode,
-                .swap_cluster_max = SWAP_CLUSTER_MAX,
-                .may_swap = 1,
-                .swappiness = vm_swappiness,
-                .order = order,
-        };
 
         count_vm_event(ALLOCSTALL);
 
@@ -1237,17 +1258,22 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
         }
 
         for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-                sc.nr_scanned = 0;
+                sc->nr_scanned = 0;
                 if (!priority)
                         disable_swap_token();
-                nr_reclaimed += shrink_zones(priority, zones, &sc);
-                shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+                nr_reclaimed += shrink_zones(priority, zones, sc);
+                /*
+                 * Don't shrink slabs when reclaiming memory from
+                 * over limit cgroups
+                 */
+                if (sc->mem_cgroup == NULL)
+                        shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
                 if (reclaim_state) {
                         nr_reclaimed += reclaim_state->reclaimed_slab;
                         reclaim_state->reclaimed_slab = 0;
                 }
-                total_scanned += sc.nr_scanned;
-                if (nr_reclaimed >= sc.swap_cluster_max) {
+                total_scanned += sc->nr_scanned;
+                if (nr_reclaimed >= sc->swap_cluster_max) {
                         ret = 1;
                         goto out;
                 }
@@ -1259,18 +1285,18 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
                  * that's undesirable in laptop mode, where we *want* lumpy
                  * writeout. So in laptop mode, write out the whole world.
                  */
-                if (total_scanned > sc.swap_cluster_max +
-                                        sc.swap_cluster_max / 2) {
+                if (total_scanned > sc->swap_cluster_max +
+                                        sc->swap_cluster_max / 2) {
                         wakeup_pdflush(laptop_mode ? 0 : total_scanned);
-                        sc.may_writepage = 1;
+                        sc->may_writepage = 1;
                 }
 
                 /* Take a nap, wait for some writeback to complete */
-                if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+                if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
                         congestion_wait(WRITE, HZ/10);
         }
         /* top priority shrink_caches still had more to do? don't OOM, then */
-        if (!sc.all_unreclaimable)
+        if (!sc->all_unreclaimable && sc->mem_cgroup == NULL)
                 ret = 1;
 out:
         /*
@@ -1293,6 +1319,54 @@ out:
         return ret;
 }
 
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+{
+        struct scan_control sc = {
+                .gfp_mask = gfp_mask,
+                .may_writepage = !laptop_mode,
+                .swap_cluster_max = SWAP_CLUSTER_MAX,
+                .may_swap = 1,
+                .swappiness = vm_swappiness,
+                .order = order,
+                .mem_cgroup = NULL,
+                .isolate_pages = isolate_pages_global,
+        };
+
+        return do_try_to_free_pages(zones, gfp_mask, &sc);
+}
+
+#ifdef CONFIG_CGROUP_MEM_CONT
+
+#ifdef CONFIG_HIGHMEM
+#define ZONE_USERPAGES ZONE_HIGHMEM
+#else
+#define ZONE_USERPAGES ZONE_NORMAL
+#endif
+
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
+{
+        struct scan_control sc = {
+                .gfp_mask = GFP_KERNEL,
+                .may_writepage = !laptop_mode,
+                .may_swap = 1,
+                .swap_cluster_max = SWAP_CLUSTER_MAX,
+                .swappiness = vm_swappiness,
+                .order = 0,
+                .mem_cgroup = mem_cont,
+                .isolate_pages = mem_cgroup_isolate_pages,
+        };
+        int node;
+        struct zone **zones;
+
+        for_each_online_node(node) {
+                zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;
+                if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
+                        return 1;
+        }
+        return 0;
+}
+#endif
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1328,6 +1402,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                 .swap_cluster_max = SWAP_CLUSTER_MAX,
                 .swappiness = vm_swappiness,
                 .order = order,
+                .mem_cgroup = NULL,
+                .isolate_pages = isolate_pages_global,
         };
         /*
          * temp_priority is used to remember the scanning priority at which
@@ -1649,6 +1725,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                 .swap_cluster_max = nr_pages,
                 .may_writepage = 1,
                 .swappiness = vm_swappiness,
+                .isolate_pages = isolate_pages_global,
         };
 
         current->reclaim_state = &reclaim_state;
@@ -1834,6 +1911,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                                         SWAP_CLUSTER_MAX),
                 .gfp_mask = gfp_mask,
                 .swappiness = vm_swappiness,
+                .isolate_pages = isolate_pages_global,
         };
         unsigned long slab_reclaimable;
 