Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 365 |
1 files changed, 138 insertions, 227 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 631002d085d1..8b9f6cae938e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -137,14 +137,21 @@ struct mem_cgroup { | |||
137 | */ | 137 | */ |
138 | struct mem_cgroup_stat stat; | 138 | struct mem_cgroup_stat stat; |
139 | }; | 139 | }; |
140 | static struct mem_cgroup init_mem_cgroup; | ||
140 | 141 | ||
141 | /* | 142 | /* |
142 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | 143 | * We use the lower bit of the page->page_cgroup pointer as a bit spin |
143 | * lock. We need to ensure that page->page_cgroup is atleast two | 144 | * lock. We need to ensure that page->page_cgroup is at least two |
144 | * byte aligned (based on comments from Nick Piggin) | 145 | * byte aligned (based on comments from Nick Piggin). But since |
146 | * bit_spin_lock doesn't actually set that lock bit in a non-debug | ||
147 | * uniprocessor kernel, we should avoid setting it here too. | ||
145 | */ | 148 | */ |
146 | #define PAGE_CGROUP_LOCK_BIT 0x0 | 149 | #define PAGE_CGROUP_LOCK_BIT 0x0 |
147 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | 150 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) |
151 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
152 | #else | ||
153 | #define PAGE_CGROUP_LOCK 0x0 | ||
154 | #endif | ||
148 | 155 | ||
149 | /* | 156 | /* |
150 | * A page_cgroup page is associated with every page descriptor. The | 157 | * A page_cgroup page is associated with every page descriptor. The |
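Note on the hunk above: the scheme relies on struct page_cgroup being at least 2-byte aligned, so bit 0 of page->page_cgroup is free to act as a lock bit, and on the fact that bit_spin_lock() never actually sets that bit on a non-debug uniprocessor kernel, which is why PAGE_CGROUP_LOCK must collapse to 0 there so stores of the pointer do not leave a stale bit behind. A minimal user-space sketch of the bit-packing idea itself (all names below are invented for illustration; this is not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LOCK_BIT 0x1UL                        /* plays the role of PAGE_CGROUP_LOCK */

struct payload { int value; };                /* at least 2-byte aligned, so bit 0 is free */

static uintptr_t slot;                        /* plays the role of page->page_cgroup */

static void assign(struct payload *p)         /* caller is assumed to hold the lock bit */
{
        slot = (uintptr_t)p | LOCK_BIT;       /* re-assert the held lock bit in the store */
}

static struct payload *get(void)
{
        return (struct payload *)(slot & ~LOCK_BIT);   /* mask the lock bit away */
}

int main(void)
{
        struct payload p = { 42 };

        assert(((uintptr_t)&p & LOCK_BIT) == 0);       /* alignment guarantees bit 0 is clear */
        slot |= LOCK_BIT;                              /* "take" the lock bit */
        assign(&p);
        printf("%d\n", get()->value);                  /* prints 42 */
        return 0;
}
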
@@ -154,37 +161,27 @@ struct page_cgroup { | |||
154 | struct list_head lru; /* per cgroup LRU list */ | 161 | struct list_head lru; /* per cgroup LRU list */ |
155 | struct page *page; | 162 | struct page *page; |
156 | struct mem_cgroup *mem_cgroup; | 163 | struct mem_cgroup *mem_cgroup; |
157 | atomic_t ref_cnt; /* Helpful when pages move b/w */ | 164 | int ref_cnt; /* cached, mapped, migrating */ |
158 | /* mapped and cached states */ | 165 | int flags; |
159 | int flags; | ||
160 | }; | 166 | }; |
161 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ | 167 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ |
162 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ | 168 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ |
163 | 169 | ||
164 | static inline int page_cgroup_nid(struct page_cgroup *pc) | 170 | static int page_cgroup_nid(struct page_cgroup *pc) |
165 | { | 171 | { |
166 | return page_to_nid(pc->page); | 172 | return page_to_nid(pc->page); |
167 | } | 173 | } |
168 | 174 | ||
169 | static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) | 175 | static enum zone_type page_cgroup_zid(struct page_cgroup *pc) |
170 | { | 176 | { |
171 | return page_zonenum(pc->page); | 177 | return page_zonenum(pc->page); |
172 | } | 178 | } |
173 | 179 | ||
174 | enum { | ||
175 | MEM_CGROUP_TYPE_UNSPEC = 0, | ||
176 | MEM_CGROUP_TYPE_MAPPED, | ||
177 | MEM_CGROUP_TYPE_CACHED, | ||
178 | MEM_CGROUP_TYPE_ALL, | ||
179 | MEM_CGROUP_TYPE_MAX, | ||
180 | }; | ||
181 | |||
182 | enum charge_type { | 180 | enum charge_type { |
183 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 181 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
184 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 182 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
185 | }; | 183 | }; |
186 | 184 | ||
187 | |||
188 | /* | 185 | /* |
189 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 186 | * Always modified under lru lock. Then, not necessary to preempt_disable() |
190 | */ | 187 | */ |
@@ -193,23 +190,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, | |||
193 | { | 190 | { |
194 | int val = (charge)? 1 : -1; | 191 | int val = (charge)? 1 : -1; |
195 | struct mem_cgroup_stat *stat = &mem->stat; | 192 | struct mem_cgroup_stat *stat = &mem->stat; |
196 | VM_BUG_ON(!irqs_disabled()); | ||
197 | 193 | ||
194 | VM_BUG_ON(!irqs_disabled()); | ||
198 | if (flags & PAGE_CGROUP_FLAG_CACHE) | 195 | if (flags & PAGE_CGROUP_FLAG_CACHE) |
199 | __mem_cgroup_stat_add_safe(stat, | 196 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); |
200 | MEM_CGROUP_STAT_CACHE, val); | ||
201 | else | 197 | else |
202 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); | 198 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); |
203 | } | 199 | } |
204 | 200 | ||
205 | static inline struct mem_cgroup_per_zone * | 201 | static struct mem_cgroup_per_zone * |
206 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 202 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
207 | { | 203 | { |
208 | BUG_ON(!mem->info.nodeinfo[nid]); | ||
209 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 204 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
210 | } | 205 | } |
211 | 206 | ||
212 | static inline struct mem_cgroup_per_zone * | 207 | static struct mem_cgroup_per_zone * |
213 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 208 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
214 | { | 209 | { |
215 | struct mem_cgroup *mem = pc->mem_cgroup; | 210 | struct mem_cgroup *mem = pc->mem_cgroup; |
@@ -234,18 +229,14 @@ static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | |||
234 | return total; | 229 | return total; |
235 | } | 230 | } |
236 | 231 | ||
237 | static struct mem_cgroup init_mem_cgroup; | 232 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
238 | |||
239 | static inline | ||
240 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | ||
241 | { | 233 | { |
242 | return container_of(cgroup_subsys_state(cont, | 234 | return container_of(cgroup_subsys_state(cont, |
243 | mem_cgroup_subsys_id), struct mem_cgroup, | 235 | mem_cgroup_subsys_id), struct mem_cgroup, |
244 | css); | 236 | css); |
245 | } | 237 | } |
246 | 238 | ||
247 | static inline | 239 | static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
248 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | ||
249 | { | 240 | { |
250 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 241 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), |
251 | struct mem_cgroup, css); | 242 | struct mem_cgroup, css); |
@@ -267,81 +258,33 @@ void mm_free_cgroup(struct mm_struct *mm) | |||
267 | 258 | ||
268 | static inline int page_cgroup_locked(struct page *page) | 259 | static inline int page_cgroup_locked(struct page *page) |
269 | { | 260 | { |
270 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, | 261 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
271 | &page->page_cgroup); | ||
272 | } | 262 | } |
273 | 263 | ||
274 | void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | 264 | static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) |
275 | { | 265 | { |
276 | int locked; | 266 | VM_BUG_ON(!page_cgroup_locked(page)); |
277 | 267 | page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); | |
278 | /* | ||
279 | * While resetting the page_cgroup we might not hold the | ||
280 | * page_cgroup lock. free_hot_cold_page() is an example | ||
281 | * of such a scenario | ||
282 | */ | ||
283 | if (pc) | ||
284 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
285 | locked = (page->page_cgroup & PAGE_CGROUP_LOCK); | ||
286 | page->page_cgroup = ((unsigned long)pc | locked); | ||
287 | } | 268 | } |
288 | 269 | ||
289 | struct page_cgroup *page_get_page_cgroup(struct page *page) | 270 | struct page_cgroup *page_get_page_cgroup(struct page *page) |
290 | { | 271 | { |
291 | return (struct page_cgroup *) | 272 | return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); |
292 | (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
293 | } | 273 | } |
294 | 274 | ||
295 | static void __always_inline lock_page_cgroup(struct page *page) | 275 | static void lock_page_cgroup(struct page *page) |
296 | { | 276 | { |
297 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | 277 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
298 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
299 | } | ||
300 | |||
301 | static void __always_inline unlock_page_cgroup(struct page *page) | ||
302 | { | ||
303 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
304 | } | 278 | } |
305 | 279 | ||
306 | /* | 280 | static int try_lock_page_cgroup(struct page *page) |
307 | * Tie new page_cgroup to struct page under lock_page_cgroup() | ||
308 | * This can fail if the page has been tied to a page_cgroup. | ||
309 | * If success, returns 0. | ||
310 | */ | ||
311 | static int page_cgroup_assign_new_page_cgroup(struct page *page, | ||
312 | struct page_cgroup *pc) | ||
313 | { | 281 | { |
314 | int ret = 0; | 282 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
315 | |||
316 | lock_page_cgroup(page); | ||
317 | if (!page_get_page_cgroup(page)) | ||
318 | page_assign_page_cgroup(page, pc); | ||
319 | else /* A page is tied to other pc. */ | ||
320 | ret = 1; | ||
321 | unlock_page_cgroup(page); | ||
322 | return ret; | ||
323 | } | 283 | } |
324 | 284 | ||
325 | /* | 285 | static void unlock_page_cgroup(struct page *page) |
326 | * Clear page->page_cgroup member under lock_page_cgroup(). | ||
327 | * If given "pc" value is different from one page->page_cgroup, | ||
328 | * page->cgroup is not cleared. | ||
329 | * Returns a value of page->page_cgroup at lock taken. | ||
330 | * A can can detect failure of clearing by following | ||
331 | * clear_page_cgroup(page, pc) == pc | ||
332 | */ | ||
333 | |||
334 | static struct page_cgroup *clear_page_cgroup(struct page *page, | ||
335 | struct page_cgroup *pc) | ||
336 | { | 286 | { |
337 | struct page_cgroup *ret; | 287 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
338 | /* lock and clear */ | ||
339 | lock_page_cgroup(page); | ||
340 | ret = page_get_page_cgroup(page); | ||
341 | if (likely(ret == pc)) | ||
342 | page_assign_page_cgroup(page, NULL); | ||
343 | unlock_page_cgroup(page); | ||
344 | return ret; | ||
345 | } | 288 | } |
346 | 289 | ||
347 | static void __mem_cgroup_remove_list(struct page_cgroup *pc) | 290 | static void __mem_cgroup_remove_list(struct page_cgroup *pc) |
@@ -399,7 +342,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
399 | int ret; | 342 | int ret; |
400 | 343 | ||
401 | task_lock(task); | 344 | task_lock(task); |
402 | ret = task->mm && vm_match_cgroup(task->mm, mem); | 345 | ret = task->mm && mm_match_cgroup(task->mm, mem); |
403 | task_unlock(task); | 346 | task_unlock(task); |
404 | return ret; | 347 | return ret; |
405 | } | 348 | } |
@@ -407,18 +350,30 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
407 | /* | 350 | /* |
408 | * This routine assumes that the appropriate zone's lru lock is already held | 351 | * This routine assumes that the appropriate zone's lru lock is already held |
409 | */ | 352 | */ |
410 | void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | 353 | void mem_cgroup_move_lists(struct page *page, bool active) |
411 | { | 354 | { |
355 | struct page_cgroup *pc; | ||
412 | struct mem_cgroup_per_zone *mz; | 356 | struct mem_cgroup_per_zone *mz; |
413 | unsigned long flags; | 357 | unsigned long flags; |
414 | 358 | ||
415 | if (!pc) | 359 | /* |
360 | * We cannot lock_page_cgroup while holding zone's lru_lock, | ||
361 | * because other holders of lock_page_cgroup can be interrupted | ||
362 | * with an attempt to rotate_reclaimable_page. But we cannot | ||
363 | * safely get to page_cgroup without it, so just try_lock it: | ||
364 | * mem_cgroup_isolate_pages allows for page left on wrong list. | ||
365 | */ | ||
366 | if (!try_lock_page_cgroup(page)) | ||
416 | return; | 367 | return; |
417 | 368 | ||
418 | mz = page_cgroup_zoneinfo(pc); | 369 | pc = page_get_page_cgroup(page); |
419 | spin_lock_irqsave(&mz->lru_lock, flags); | 370 | if (pc) { |
420 | __mem_cgroup_move_lists(pc, active); | 371 | mz = page_cgroup_zoneinfo(pc); |
421 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 372 | spin_lock_irqsave(&mz->lru_lock, flags); |
373 | __mem_cgroup_move_lists(pc, active); | ||
374 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
375 | } | ||
376 | unlock_page_cgroup(page); | ||
422 | } | 377 | } |
423 | 378 | ||
424 | /* | 379 | /* |
@@ -437,6 +392,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | |||
437 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 392 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
438 | return (int)((rss * 100L) / total); | 393 | return (int)((rss * 100L) / total); |
439 | } | 394 | } |
395 | |||
440 | /* | 396 | /* |
441 | * This function is called from vmscan.c. In page reclaiming loop. balance | 397 | * This function is called from vmscan.c. In page reclaiming loop. balance |
442 | * between active and inactive list is calculated. For memory controller | 398 | * between active and inactive list is calculated. For memory controller |
@@ -500,7 +456,6 @@ long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, | |||
500 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 456 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); |
501 | 457 | ||
502 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); | 458 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); |
503 | |||
504 | return (nr_inactive >> priority); | 459 | return (nr_inactive >> priority); |
505 | } | 460 | } |
506 | 461 | ||
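For orientation, the scan target returned above shrinks exponentially with reclaim priority: with, say, 4096 inactive pages on the per-cgroup zone list, a pass at priority 12 (the usual starting priority at the time) targets 4096 >> 12 = 1 page, priority 6 targets 64 pages, and priority 0 targets all 4096.
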
@@ -586,26 +541,21 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
586 | * with it | 541 | * with it |
587 | */ | 542 | */ |
588 | retry: | 543 | retry: |
589 | if (page) { | 544 | lock_page_cgroup(page); |
590 | lock_page_cgroup(page); | 545 | pc = page_get_page_cgroup(page); |
591 | pc = page_get_page_cgroup(page); | 546 | /* |
592 | /* | 547 | * The page_cgroup exists and |
593 | * The page_cgroup exists and | 548 | * the page has already been accounted. |
594 | * the page has already been accounted. | 549 | */ |
595 | */ | 550 | if (pc) { |
596 | if (pc) { | 551 | VM_BUG_ON(pc->page != page); |
597 | if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { | 552 | VM_BUG_ON(pc->ref_cnt <= 0); |
598 | /* this page is under being uncharged ? */ | 553 | |
599 | unlock_page_cgroup(page); | 554 | pc->ref_cnt++; |
600 | cpu_relax(); | ||
601 | goto retry; | ||
602 | } else { | ||
603 | unlock_page_cgroup(page); | ||
604 | goto done; | ||
605 | } | ||
606 | } | ||
607 | unlock_page_cgroup(page); | 555 | unlock_page_cgroup(page); |
556 | goto done; | ||
608 | } | 557 | } |
558 | unlock_page_cgroup(page); | ||
609 | 559 | ||
610 | pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); | 560 | pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); |
611 | if (pc == NULL) | 561 | if (pc == NULL) |
@@ -623,16 +573,11 @@ retry: | |||
623 | rcu_read_lock(); | 573 | rcu_read_lock(); |
624 | mem = rcu_dereference(mm->mem_cgroup); | 574 | mem = rcu_dereference(mm->mem_cgroup); |
625 | /* | 575 | /* |
626 | * For every charge from the cgroup, increment reference | 576 | * For every charge from the cgroup, increment reference count |
627 | * count | ||
628 | */ | 577 | */ |
629 | css_get(&mem->css); | 578 | css_get(&mem->css); |
630 | rcu_read_unlock(); | 579 | rcu_read_unlock(); |
631 | 580 | ||
632 | /* | ||
633 | * If we created the page_cgroup, we should free it on exceeding | ||
634 | * the cgroup limit. | ||
635 | */ | ||
636 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { | 581 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { |
637 | if (!(gfp_mask & __GFP_WAIT)) | 582 | if (!(gfp_mask & __GFP_WAIT)) |
638 | goto out; | 583 | goto out; |
@@ -641,12 +586,12 @@ retry: | |||
641 | continue; | 586 | continue; |
642 | 587 | ||
643 | /* | 588 | /* |
644 | * try_to_free_mem_cgroup_pages() might not give us a full | 589 | * try_to_free_mem_cgroup_pages() might not give us a full |
645 | * picture of reclaim. Some pages are reclaimed and might be | 590 | * picture of reclaim. Some pages are reclaimed and might be |
646 | * moved to swap cache or just unmapped from the cgroup. | 591 | * moved to swap cache or just unmapped from the cgroup. |
647 | * Check the limit again to see if the reclaim reduced the | 592 | * Check the limit again to see if the reclaim reduced the |
648 | * current usage of the cgroup before giving up | 593 | * current usage of the cgroup before giving up |
649 | */ | 594 | */ |
650 | if (res_counter_check_under_limit(&mem->res)) | 595 | if (res_counter_check_under_limit(&mem->res)) |
651 | continue; | 596 | continue; |
652 | 597 | ||
@@ -657,14 +602,16 @@ retry: | |||
657 | congestion_wait(WRITE, HZ/10); | 602 | congestion_wait(WRITE, HZ/10); |
658 | } | 603 | } |
659 | 604 | ||
660 | atomic_set(&pc->ref_cnt, 1); | 605 | pc->ref_cnt = 1; |
661 | pc->mem_cgroup = mem; | 606 | pc->mem_cgroup = mem; |
662 | pc->page = page; | 607 | pc->page = page; |
663 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | 608 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; |
664 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) | 609 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) |
665 | pc->flags |= PAGE_CGROUP_FLAG_CACHE; | 610 | pc->flags |= PAGE_CGROUP_FLAG_CACHE; |
666 | 611 | ||
667 | if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) { | 612 | lock_page_cgroup(page); |
613 | if (page_get_page_cgroup(page)) { | ||
614 | unlock_page_cgroup(page); | ||
668 | /* | 615 | /* |
669 | * Another charge has been added to this page already. | 616 | * Another charge has been added to this page already. |
670 | * We take lock_page_cgroup(page) again and read | 617 | * We take lock_page_cgroup(page) again and read |
@@ -673,17 +620,16 @@ retry: | |||
673 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 620 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
674 | css_put(&mem->css); | 621 | css_put(&mem->css); |
675 | kfree(pc); | 622 | kfree(pc); |
676 | if (!page) | ||
677 | goto done; | ||
678 | goto retry; | 623 | goto retry; |
679 | } | 624 | } |
625 | page_assign_page_cgroup(page, pc); | ||
680 | 626 | ||
681 | mz = page_cgroup_zoneinfo(pc); | 627 | mz = page_cgroup_zoneinfo(pc); |
682 | spin_lock_irqsave(&mz->lru_lock, flags); | 628 | spin_lock_irqsave(&mz->lru_lock, flags); |
683 | /* Update statistics vector */ | ||
684 | __mem_cgroup_add_list(pc); | 629 | __mem_cgroup_add_list(pc); |
685 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 630 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
686 | 631 | ||
632 | unlock_page_cgroup(page); | ||
687 | done: | 633 | done: |
688 | return 0; | 634 | return 0; |
689 | out: | 635 | out: |
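Note on the charging hunks above: they follow a standard optimistic-allocation shape; look up under the lock, drop the lock to allocate (the allocation may sleep), then re-take the lock and re-check before installing, discarding the new object and retrying if another task won the race. A self-contained user-space sketch of that shape (invented names, a pthread mutex standing in for lock_page_cgroup):

#include <pthread.h>
#include <stdlib.h>

struct obj { int refs; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;   /* ~ lock_page_cgroup */
static struct obj *slot;                                   /* ~ page->page_cgroup */

static struct obj *get_or_create(void)
{
        struct obj *cur, *new_obj;

retry:
        pthread_mutex_lock(&lock);
        cur = slot;
        if (cur) {
                cur->refs++;                    /* already present: just take a reference */
                pthread_mutex_unlock(&lock);
                return cur;
        }
        pthread_mutex_unlock(&lock);

        new_obj = calloc(1, sizeof(*new_obj));  /* may block; done without the lock */
        if (!new_obj)
                return NULL;
        new_obj->refs = 1;

        pthread_mutex_lock(&lock);
        if (slot) {                             /* lost the race: discard ours and retry */
                pthread_mutex_unlock(&lock);
                free(new_obj);
                goto retry;
        }
        slot = new_obj;
        pthread_mutex_unlock(&lock);
        return new_obj;
}

int main(void)
{
        return get_or_create() ? 0 : 1;
}
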
@@ -693,70 +639,61 @@ err: | |||
693 | return -ENOMEM; | 639 | return -ENOMEM; |
694 | } | 640 | } |
695 | 641 | ||
696 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, | 642 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) |
697 | gfp_t gfp_mask) | ||
698 | { | 643 | { |
699 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 644 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
700 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 645 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
701 | } | 646 | } |
702 | 647 | ||
703 | /* | ||
704 | * See if the cached pages should be charged at all? | ||
705 | */ | ||
706 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 648 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
707 | gfp_t gfp_mask) | 649 | gfp_t gfp_mask) |
708 | { | 650 | { |
709 | int ret = 0; | ||
710 | if (!mm) | 651 | if (!mm) |
711 | mm = &init_mm; | 652 | mm = &init_mm; |
712 | 653 | return mem_cgroup_charge_common(page, mm, gfp_mask, | |
713 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | ||
714 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 654 | MEM_CGROUP_CHARGE_TYPE_CACHE); |
715 | return ret; | ||
716 | } | 655 | } |
717 | 656 | ||
718 | /* | 657 | /* |
719 | * Uncharging is always a welcome operation, we never complain, simply | 658 | * Uncharging is always a welcome operation, we never complain, simply |
720 | * uncharge. This routine should be called with lock_page_cgroup held | 659 | * uncharge. |
721 | */ | 660 | */ |
722 | void mem_cgroup_uncharge(struct page_cgroup *pc) | 661 | void mem_cgroup_uncharge_page(struct page *page) |
723 | { | 662 | { |
663 | struct page_cgroup *pc; | ||
724 | struct mem_cgroup *mem; | 664 | struct mem_cgroup *mem; |
725 | struct mem_cgroup_per_zone *mz; | 665 | struct mem_cgroup_per_zone *mz; |
726 | struct page *page; | ||
727 | unsigned long flags; | 666 | unsigned long flags; |
728 | 667 | ||
729 | /* | 668 | /* |
730 | * Check if our page_cgroup is valid | 669 | * Check if our page_cgroup is valid |
731 | */ | 670 | */ |
671 | lock_page_cgroup(page); | ||
672 | pc = page_get_page_cgroup(page); | ||
732 | if (!pc) | 673 | if (!pc) |
733 | return; | 674 | goto unlock; |
734 | 675 | ||
735 | if (atomic_dec_and_test(&pc->ref_cnt)) { | 676 | VM_BUG_ON(pc->page != page); |
736 | page = pc->page; | 677 | VM_BUG_ON(pc->ref_cnt <= 0); |
678 | |||
679 | if (--(pc->ref_cnt) == 0) { | ||
737 | mz = page_cgroup_zoneinfo(pc); | 680 | mz = page_cgroup_zoneinfo(pc); |
738 | /* | 681 | spin_lock_irqsave(&mz->lru_lock, flags); |
739 | * get page->cgroup and clear it under lock. | 682 | __mem_cgroup_remove_list(pc); |
740 | * force_empty can drop page->cgroup without checking refcnt. | 683 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
741 | */ | 684 | |
685 | page_assign_page_cgroup(page, NULL); | ||
742 | unlock_page_cgroup(page); | 686 | unlock_page_cgroup(page); |
743 | if (clear_page_cgroup(page, pc) == pc) { | 687 | |
744 | mem = pc->mem_cgroup; | 688 | mem = pc->mem_cgroup; |
745 | css_put(&mem->css); | 689 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
746 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 690 | css_put(&mem->css); |
747 | spin_lock_irqsave(&mz->lru_lock, flags); | 691 | |
748 | __mem_cgroup_remove_list(pc); | 692 | kfree(pc); |
749 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 693 | return; |
750 | kfree(pc); | ||
751 | } | ||
752 | lock_page_cgroup(page); | ||
753 | } | 694 | } |
754 | } | ||
755 | 695 | ||
756 | void mem_cgroup_uncharge_page(struct page *page) | 696 | unlock: |
757 | { | ||
758 | lock_page_cgroup(page); | ||
759 | mem_cgroup_uncharge(page_get_page_cgroup(page)); | ||
760 | unlock_page_cgroup(page); | 697 | unlock_page_cgroup(page); |
761 | } | 698 | } |
762 | 699 | ||
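Worth noting in the rewritten uncharge path above: ref_cnt can now be a plain int because every access happens under lock_page_cgroup(), and the page_cgroup is detached from the page while that lock is still held, so once the lock is dropped nobody can reach the object being freed. A small user-space sketch of that detach-then-free ordering (invented names, a pthread mutex standing in for the page bit lock):

#include <pthread.h>
#include <stdlib.h>

struct obj { int ref_cnt; };

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *slot;                 /* ~ page->page_cgroup */

static void uncharge(void)
{
        struct obj *pc;

        pthread_mutex_lock(&page_lock);
        pc = slot;
        if (pc && --pc->ref_cnt == 0) {
                slot = NULL;             /* detach while still holding the lock */
                pthread_mutex_unlock(&page_lock);
                free(pc);                /* unreachable now, safe to free unlocked */
                return;
        }
        pthread_mutex_unlock(&page_lock);
}

int main(void)
{
        slot = calloc(1, sizeof(*slot));
        if (!slot)
                return 1;
        slot->ref_cnt = 1;
        uncharge();                      /* drops the last reference and frees */
        return 0;
}
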
@@ -764,63 +701,59 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
764 | * Returns non-zero if a page (under migration) has valid page_cgroup member. | 701 | * Returns non-zero if a page (under migration) has valid page_cgroup member. |
765 | * Refcnt of page_cgroup is incremented. | 702 | * Refcnt of page_cgroup is incremented. |
766 | */ | 703 | */ |
767 | |||
768 | int mem_cgroup_prepare_migration(struct page *page) | 704 | int mem_cgroup_prepare_migration(struct page *page) |
769 | { | 705 | { |
770 | struct page_cgroup *pc; | 706 | struct page_cgroup *pc; |
771 | int ret = 0; | 707 | |
772 | lock_page_cgroup(page); | 708 | lock_page_cgroup(page); |
773 | pc = page_get_page_cgroup(page); | 709 | pc = page_get_page_cgroup(page); |
774 | if (pc && atomic_inc_not_zero(&pc->ref_cnt)) | 710 | if (pc) |
775 | ret = 1; | 711 | pc->ref_cnt++; |
776 | unlock_page_cgroup(page); | 712 | unlock_page_cgroup(page); |
777 | return ret; | 713 | return pc != NULL; |
778 | } | 714 | } |
779 | 715 | ||
780 | void mem_cgroup_end_migration(struct page *page) | 716 | void mem_cgroup_end_migration(struct page *page) |
781 | { | 717 | { |
782 | struct page_cgroup *pc; | 718 | mem_cgroup_uncharge_page(page); |
783 | |||
784 | lock_page_cgroup(page); | ||
785 | pc = page_get_page_cgroup(page); | ||
786 | mem_cgroup_uncharge(pc); | ||
787 | unlock_page_cgroup(page); | ||
788 | } | 719 | } |
720 | |||
789 | /* | 721 | /* |
790 | * We know both *page* and *newpage* are now not-on-LRU and Pg_locked. | 722 | * We know both *page* and *newpage* are now not-on-LRU and PG_locked. |
791 | * And no race with uncharge() routines because page_cgroup for *page* | 723 | * And no race with uncharge() routines because page_cgroup for *page* |
792 | * has extra one reference by mem_cgroup_prepare_migration. | 724 | * has extra one reference by mem_cgroup_prepare_migration. |
793 | */ | 725 | */ |
794 | |||
795 | void mem_cgroup_page_migration(struct page *page, struct page *newpage) | 726 | void mem_cgroup_page_migration(struct page *page, struct page *newpage) |
796 | { | 727 | { |
797 | struct page_cgroup *pc; | 728 | struct page_cgroup *pc; |
798 | struct mem_cgroup *mem; | ||
799 | unsigned long flags; | ||
800 | struct mem_cgroup_per_zone *mz; | 729 | struct mem_cgroup_per_zone *mz; |
801 | retry: | 730 | unsigned long flags; |
731 | |||
732 | lock_page_cgroup(page); | ||
802 | pc = page_get_page_cgroup(page); | 733 | pc = page_get_page_cgroup(page); |
803 | if (!pc) | 734 | if (!pc) { |
735 | unlock_page_cgroup(page); | ||
804 | return; | 736 | return; |
805 | mem = pc->mem_cgroup; | 737 | } |
738 | |||
806 | mz = page_cgroup_zoneinfo(pc); | 739 | mz = page_cgroup_zoneinfo(pc); |
807 | if (clear_page_cgroup(page, pc) != pc) | ||
808 | goto retry; | ||
809 | spin_lock_irqsave(&mz->lru_lock, flags); | 740 | spin_lock_irqsave(&mz->lru_lock, flags); |
810 | |||
811 | __mem_cgroup_remove_list(pc); | 741 | __mem_cgroup_remove_list(pc); |
812 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 742 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
813 | 743 | ||
744 | page_assign_page_cgroup(page, NULL); | ||
745 | unlock_page_cgroup(page); | ||
746 | |||
814 | pc->page = newpage; | 747 | pc->page = newpage; |
815 | lock_page_cgroup(newpage); | 748 | lock_page_cgroup(newpage); |
816 | page_assign_page_cgroup(newpage, pc); | 749 | page_assign_page_cgroup(newpage, pc); |
817 | unlock_page_cgroup(newpage); | ||
818 | 750 | ||
819 | mz = page_cgroup_zoneinfo(pc); | 751 | mz = page_cgroup_zoneinfo(pc); |
820 | spin_lock_irqsave(&mz->lru_lock, flags); | 752 | spin_lock_irqsave(&mz->lru_lock, flags); |
821 | __mem_cgroup_add_list(pc); | 753 | __mem_cgroup_add_list(pc); |
822 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 754 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
823 | return; | 755 | |
756 | unlock_page_cgroup(newpage); | ||
824 | } | 757 | } |
825 | 758 | ||
826 | /* | 759 | /* |
@@ -829,14 +762,13 @@ retry: | |||
829 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 762 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
830 | */ | 763 | */ |
831 | #define FORCE_UNCHARGE_BATCH (128) | 764 | #define FORCE_UNCHARGE_BATCH (128) |
832 | static void | 765 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
833 | mem_cgroup_force_empty_list(struct mem_cgroup *mem, | ||
834 | struct mem_cgroup_per_zone *mz, | 766 | struct mem_cgroup_per_zone *mz, |
835 | int active) | 767 | int active) |
836 | { | 768 | { |
837 | struct page_cgroup *pc; | 769 | struct page_cgroup *pc; |
838 | struct page *page; | 770 | struct page *page; |
839 | int count; | 771 | int count = FORCE_UNCHARGE_BATCH; |
840 | unsigned long flags; | 772 | unsigned long flags; |
841 | struct list_head *list; | 773 | struct list_head *list; |
842 | 774 | ||
@@ -845,46 +777,36 @@ mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
845 | else | 777 | else |
846 | list = &mz->inactive_list; | 778 | list = &mz->inactive_list; |
847 | 779 | ||
848 | if (list_empty(list)) | ||
849 | return; | ||
850 | retry: | ||
851 | count = FORCE_UNCHARGE_BATCH; | ||
852 | spin_lock_irqsave(&mz->lru_lock, flags); | 780 | spin_lock_irqsave(&mz->lru_lock, flags); |
853 | 781 | while (!list_empty(list)) { | |
854 | while (--count && !list_empty(list)) { | ||
855 | pc = list_entry(list->prev, struct page_cgroup, lru); | 782 | pc = list_entry(list->prev, struct page_cgroup, lru); |
856 | page = pc->page; | 783 | page = pc->page; |
857 | /* Avoid race with charge */ | 784 | get_page(page); |
858 | atomic_set(&pc->ref_cnt, 0); | 785 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
859 | if (clear_page_cgroup(page, pc) == pc) { | 786 | mem_cgroup_uncharge_page(page); |
860 | css_put(&mem->css); | 787 | put_page(page); |
861 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 788 | if (--count <= 0) { |
862 | __mem_cgroup_remove_list(pc); | 789 | count = FORCE_UNCHARGE_BATCH; |
863 | kfree(pc); | 790 | cond_resched(); |
864 | } else /* being uncharged ? ...do relax */ | 791 | } |
865 | break; | 792 | spin_lock_irqsave(&mz->lru_lock, flags); |
866 | } | 793 | } |
867 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 794 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
868 | if (!list_empty(list)) { | ||
869 | cond_resched(); | ||
870 | goto retry; | ||
871 | } | ||
872 | return; | ||
873 | } | 795 | } |
874 | 796 | ||
875 | /* | 797 | /* |
876 | * make mem_cgroup's charge to be 0 if there is no task. | 798 | * make mem_cgroup's charge to be 0 if there is no task. |
877 | * This enables deleting this mem_cgroup. | 799 | * This enables deleting this mem_cgroup. |
878 | */ | 800 | */ |
879 | 801 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |
880 | int mem_cgroup_force_empty(struct mem_cgroup *mem) | ||
881 | { | 802 | { |
882 | int ret = -EBUSY; | 803 | int ret = -EBUSY; |
883 | int node, zid; | 804 | int node, zid; |
805 | |||
884 | css_get(&mem->css); | 806 | css_get(&mem->css); |
885 | /* | 807 | /* |
886 | * page reclaim code (kswapd etc..) will move pages between | 808 | * page reclaim code (kswapd etc..) will move pages between |
887 | * active_list <-> inactive_list while we don't take a lock. | 809 | * active_list <-> inactive_list while we don't take a lock. |
888 | * So, we have to do loop here until all lists are empty. | 810 | * So, we have to do loop here until all lists are empty. |
889 | */ | 811 | */ |
890 | while (mem->res.usage > 0) { | 812 | while (mem->res.usage > 0) { |
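Note on the force_empty hunk above: the drain loop does the heavy work (uncharging) with the lru_lock dropped, and every FORCE_UNCHARGE_BATCH iterations gives the scheduler a chance to run before continuing. A rough user-space analogue of that batched-drain pattern (sched_yield() standing in for cond_resched(); names invented):

#include <pthread.h>
#include <sched.h>
#include <stdlib.h>

#define BATCH 128

struct node { struct node *next; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void drain(void)
{
        struct node *n;
        int count = BATCH;

        pthread_mutex_lock(&list_lock);
        while ((n = head) != NULL) {
                head = n->next;                         /* take one entry off the list */
                pthread_mutex_unlock(&list_lock);       /* heavy work done unlocked */
                free(n);
                if (--count <= 0) {
                        count = BATCH;
                        sched_yield();                  /* ~ cond_resched() */
                }
                pthread_mutex_lock(&list_lock);
        }
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        int i;

        for (i = 0; i < 1000; i++) {            /* build a toy list, then drain it */
                struct node *n = malloc(sizeof(*n));
                if (!n)
                        break;
                n->next = head;
                head = n;
        }
        drain();
        return 0;
}
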
@@ -906,9 +828,7 @@ out: | |||
906 | return ret; | 828 | return ret; |
907 | } | 829 | } |
908 | 830 | ||
909 | 831 | static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) | |
910 | |||
911 | int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) | ||
912 | { | 832 | { |
913 | *tmp = memparse(buf, &buf); | 833 | *tmp = memparse(buf, &buf); |
914 | if (*buf != '\0') | 834 | if (*buf != '\0') |
@@ -945,8 +865,7 @@ static ssize_t mem_force_empty_write(struct cgroup *cont, | |||
945 | size_t nbytes, loff_t *ppos) | 865 | size_t nbytes, loff_t *ppos) |
946 | { | 866 | { |
947 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 867 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
948 | int ret; | 868 | int ret = mem_cgroup_force_empty(mem); |
949 | ret = mem_cgroup_force_empty(mem); | ||
950 | if (!ret) | 869 | if (!ret) |
951 | ret = nbytes; | 870 | ret = nbytes; |
952 | return ret; | 871 | return ret; |
@@ -955,7 +874,6 @@ static ssize_t mem_force_empty_write(struct cgroup *cont, | |||
955 | /* | 874 | /* |
956 | * Note: This should be removed if cgroup supports write-only file. | 875 | * Note: This should be removed if cgroup supports write-only file. |
957 | */ | 876 | */ |
958 | |||
959 | static ssize_t mem_force_empty_read(struct cgroup *cont, | 877 | static ssize_t mem_force_empty_read(struct cgroup *cont, |
960 | struct cftype *cft, | 878 | struct cftype *cft, |
961 | struct file *file, char __user *userbuf, | 879 | struct file *file, char __user *userbuf, |
@@ -964,7 +882,6 @@ static ssize_t mem_force_empty_read(struct cgroup *cont, | |||
964 | return -EINVAL; | 882 | return -EINVAL; |
965 | } | 883 | } |
966 | 884 | ||
967 | |||
968 | static const struct mem_cgroup_stat_desc { | 885 | static const struct mem_cgroup_stat_desc { |
969 | const char *msg; | 886 | const char *msg; |
970 | u64 unit; | 887 | u64 unit; |
@@ -1017,8 +934,6 @@ static int mem_control_stat_open(struct inode *unused, struct file *file) | |||
1017 | return single_open(file, mem_control_stat_show, cont); | 934 | return single_open(file, mem_control_stat_show, cont); |
1018 | } | 935 | } |
1019 | 936 | ||
1020 | |||
1021 | |||
1022 | static struct cftype mem_cgroup_files[] = { | 937 | static struct cftype mem_cgroup_files[] = { |
1023 | { | 938 | { |
1024 | .name = "usage_in_bytes", | 939 | .name = "usage_in_bytes", |
@@ -1084,9 +999,6 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
1084 | kfree(mem->info.nodeinfo[node]); | 999 | kfree(mem->info.nodeinfo[node]); |
1085 | } | 1000 | } |
1086 | 1001 | ||
1087 | |||
1088 | static struct mem_cgroup init_mem_cgroup; | ||
1089 | |||
1090 | static struct cgroup_subsys_state * | 1002 | static struct cgroup_subsys_state * |
1091 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 1003 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
1092 | { | 1004 | { |
@@ -1176,7 +1088,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
1176 | 1088 | ||
1177 | out: | 1089 | out: |
1178 | mmput(mm); | 1090 | mmput(mm); |
1179 | return; | ||
1180 | } | 1091 | } |
1181 | 1092 | ||
1182 | struct cgroup_subsys mem_cgroup_subsys = { | 1093 | struct cgroup_subsys mem_cgroup_subsys = { |