diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2008-10-18 23:28:16 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-10-20 11:52:39 -0400 |
commit | 52d4b9ac0b985168009c2a57098324e67bae171f (patch) | |
tree | b3e3b854166930af893be90ea30a7ab0d65c59e7 /mm/memcontrol.c | |
parent | c05555b572921c464d064d9267f7f7bc06d424fa (diff) |
memcg: allocate all page_cgroup at boot
Allocate all page_cgroup at boot and remove the page_cgroup pointer from
struct page. This patch adds an interface as
struct page_cgroup *lookup_page_cgroup(struct page*)
FLATMEM, DISCONTIGMEM, SPARSEMEM, and MEMORY_HOTPLUG are all supported.
Removing the page_cgroup pointer from struct page reduces memory usage by
- 4 bytes per PAGE_SIZE (32-bit), or
- 8 bytes per PAGE_SIZE (64-bit)
if the memory controller is disabled (even if it is configured).
On usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
On my x86-64 server with 48GB of memory, this saves 96MB of memory.
I think this reduction makes sense.
By pre-allocation, kmalloc/kfree in charge/uncharge are removed.
This means
- we no longer need to worry about kmalloc failure.
(this can happen because of gfp_mask type.)
- we can avoid calling kmalloc/kfree.
- we can avoid allocating tons of small objects which can be fragmented.
- we can know what amount of memory will be used for this extra-lru handling.
I added printk messages such as
"allocated %ld bytes of page_cgroup"
"please try cgroup_disable=memory option if you don't want"
which should be informative enough for users.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 247 |
1 files changed, 80 insertions, 167 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 031682e7ef0c..d4a92b63e98e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -33,11 +33,11 @@ | |||
33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/mm_inline.h> | 35 | #include <linux/mm_inline.h> |
36 | #include <linux/page_cgroup.h> | ||
36 | 37 | ||
37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
38 | 39 | ||
39 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
40 | static struct kmem_cache *page_cgroup_cache __read_mostly; | ||
41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
42 | 42 | ||
43 | /* | 43 | /* |
@@ -135,79 +135,6 @@ struct mem_cgroup { | |||
135 | }; | 135 | }; |
136 | static struct mem_cgroup init_mem_cgroup; | 136 | static struct mem_cgroup init_mem_cgroup; |
137 | 137 | ||
138 | /* | ||
139 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | ||
140 | * lock. We need to ensure that page->page_cgroup is at least two | ||
141 | * byte aligned (based on comments from Nick Piggin). But since | ||
142 | * bit_spin_lock doesn't actually set that lock bit in a non-debug | ||
143 | * uniprocessor kernel, we should avoid setting it here too. | ||
144 | */ | ||
145 | #define PAGE_CGROUP_LOCK_BIT 0x0 | ||
146 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | ||
147 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
148 | #else | ||
149 | #define PAGE_CGROUP_LOCK 0x0 | ||
150 | #endif | ||
151 | |||
152 | /* | ||
153 | * A page_cgroup page is associated with every page descriptor. The | ||
154 | * page_cgroup helps us identify information about the cgroup | ||
155 | */ | ||
156 | struct page_cgroup { | ||
157 | struct list_head lru; /* per cgroup LRU list */ | ||
158 | struct page *page; | ||
159 | struct mem_cgroup *mem_cgroup; | ||
160 | unsigned long flags; | ||
161 | }; | ||
162 | |||
163 | enum { | ||
164 | /* flags for mem_cgroup */ | ||
165 | PCG_CACHE, /* charged as cache */ | ||
166 | /* flags for LRU placement */ | ||
167 | PCG_ACTIVE, /* page is active in this cgroup */ | ||
168 | PCG_FILE, /* page is file system backed */ | ||
169 | PCG_UNEVICTABLE, /* page is unevictableable */ | ||
170 | }; | ||
171 | |||
172 | #define TESTPCGFLAG(uname, lname) \ | ||
173 | static inline int PageCgroup##uname(struct page_cgroup *pc) \ | ||
174 | { return test_bit(PCG_##lname, &pc->flags); } | ||
175 | |||
176 | #define SETPCGFLAG(uname, lname) \ | ||
177 | static inline void SetPageCgroup##uname(struct page_cgroup *pc)\ | ||
178 | { set_bit(PCG_##lname, &pc->flags); } | ||
179 | |||
180 | #define CLEARPCGFLAG(uname, lname) \ | ||
181 | static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ | ||
182 | { clear_bit(PCG_##lname, &pc->flags); } | ||
183 | |||
184 | |||
185 | /* Cache flag is set only once (at allocation) */ | ||
186 | TESTPCGFLAG(Cache, CACHE) | ||
187 | |||
188 | /* LRU management flags (from global-lru definition) */ | ||
189 | TESTPCGFLAG(File, FILE) | ||
190 | SETPCGFLAG(File, FILE) | ||
191 | CLEARPCGFLAG(File, FILE) | ||
192 | |||
193 | TESTPCGFLAG(Active, ACTIVE) | ||
194 | SETPCGFLAG(Active, ACTIVE) | ||
195 | CLEARPCGFLAG(Active, ACTIVE) | ||
196 | |||
197 | TESTPCGFLAG(Unevictable, UNEVICTABLE) | ||
198 | SETPCGFLAG(Unevictable, UNEVICTABLE) | ||
199 | CLEARPCGFLAG(Unevictable, UNEVICTABLE) | ||
200 | |||
201 | static int page_cgroup_nid(struct page_cgroup *pc) | ||
202 | { | ||
203 | return page_to_nid(pc->page); | ||
204 | } | ||
205 | |||
206 | static enum zone_type page_cgroup_zid(struct page_cgroup *pc) | ||
207 | { | ||
208 | return page_zonenum(pc->page); | ||
209 | } | ||
210 | |||
211 | enum charge_type { | 138 | enum charge_type { |
212 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
213 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
@@ -216,12 +143,18 @@ enum charge_type { | |||
216 | NR_CHARGE_TYPE, | 143 | NR_CHARGE_TYPE, |
217 | }; | 144 | }; |
218 | 145 | ||
146 | /* only for here (for easy reading.) */ | ||
147 | #define PCGF_CACHE (1UL << PCG_CACHE) | ||
148 | #define PCGF_USED (1UL << PCG_USED) | ||
149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | ||
150 | #define PCGF_LOCK (1UL << PCG_LOCK) | ||
151 | #define PCGF_FILE (1UL << PCG_FILE) | ||
219 | static const unsigned long | 152 | static const unsigned long |
220 | pcg_default_flags[NR_CHARGE_TYPE] = { | 153 | pcg_default_flags[NR_CHARGE_TYPE] = { |
221 | ((1 << PCG_CACHE) | (1 << PCG_FILE)), | 154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ |
222 | ((1 << PCG_ACTIVE)), | 155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ |
223 | ((1 << PCG_ACTIVE) | (1 << PCG_CACHE)), | 156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ |
224 | 0, | 157 | 0, /* FORCE */ |
225 | }; | 158 | }; |
226 | 159 | ||
227 | /* | 160 | /* |
@@ -303,37 +236,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
303 | struct mem_cgroup, css); | 236 | struct mem_cgroup, css); |
304 | } | 237 | } |
305 | 238 | ||
306 | static inline int page_cgroup_locked(struct page *page) | ||
307 | { | ||
308 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
309 | } | ||
310 | |||
311 | static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | ||
312 | { | ||
313 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
314 | page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); | ||
315 | } | ||
316 | |||
317 | struct page_cgroup *page_get_page_cgroup(struct page *page) | ||
318 | { | ||
319 | return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
320 | } | ||
321 | |||
322 | static void lock_page_cgroup(struct page *page) | ||
323 | { | ||
324 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
325 | } | ||
326 | |||
327 | static int try_lock_page_cgroup(struct page *page) | ||
328 | { | ||
329 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
330 | } | ||
331 | |||
332 | static void unlock_page_cgroup(struct page *page) | ||
333 | { | ||
334 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
335 | } | ||
336 | |||
337 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, |
338 | struct page_cgroup *pc) | 240 | struct page_cgroup *pc) |
339 | { | 241 | { |
@@ -436,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | |||
436 | * safely get to page_cgroup without it, so just try_lock it: | 338 | * safely get to page_cgroup without it, so just try_lock it: |
437 | * mem_cgroup_isolate_pages allows for page left on wrong list. | 339 | * mem_cgroup_isolate_pages allows for page left on wrong list. |
438 | */ | 340 | */ |
439 | if (!try_lock_page_cgroup(page)) | 341 | pc = lookup_page_cgroup(page); |
342 | if (!trylock_page_cgroup(pc)) | ||
440 | return; | 343 | return; |
441 | 344 | if (pc && PageCgroupUsed(pc)) { | |
442 | pc = page_get_page_cgroup(page); | ||
443 | if (pc) { | ||
444 | mz = page_cgroup_zoneinfo(pc); | 345 | mz = page_cgroup_zoneinfo(pc); |
445 | spin_lock_irqsave(&mz->lru_lock, flags); | 346 | spin_lock_irqsave(&mz->lru_lock, flags); |
446 | __mem_cgroup_move_lists(pc, lru); | 347 | __mem_cgroup_move_lists(pc, lru); |
447 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 348 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
448 | } | 349 | } |
449 | unlock_page_cgroup(page); | 350 | unlock_page_cgroup(pc); |
450 | } | 351 | } |
451 | 352 | ||
452 | /* | 353 | /* |
@@ -533,6 +434,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
533 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 434 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
534 | if (scan >= nr_to_scan) | 435 | if (scan >= nr_to_scan) |
535 | break; | 436 | break; |
437 | if (unlikely(!PageCgroupUsed(pc))) | ||
438 | continue; | ||
536 | page = pc->page; | 439 | page = pc->page; |
537 | 440 | ||
538 | if (unlikely(!PageLRU(page))) | 441 | if (unlikely(!PageLRU(page))) |
@@ -576,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
576 | { | 479 | { |
577 | struct mem_cgroup *mem; | 480 | struct mem_cgroup *mem; |
578 | struct page_cgroup *pc; | 481 | struct page_cgroup *pc; |
579 | unsigned long flags; | ||
580 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 482 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
581 | struct mem_cgroup_per_zone *mz; | 483 | struct mem_cgroup_per_zone *mz; |
484 | unsigned long flags; | ||
582 | 485 | ||
583 | pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); | 486 | pc = lookup_page_cgroup(page); |
584 | if (unlikely(pc == NULL)) | 487 | /* can happen at boot */ |
585 | goto err; | 488 | if (unlikely(!pc)) |
586 | 489 | return 0; | |
490 | prefetchw(pc); | ||
587 | /* | 491 | /* |
588 | * We always charge the cgroup the mm_struct belongs to. | 492 | * We always charge the cgroup the mm_struct belongs to. |
589 | * The mm_struct's mem_cgroup changes on task migration if the | 493 | * The mm_struct's mem_cgroup changes on task migration if the |
590 | * thread group leader migrates. It's possible that mm is not | 494 | * thread group leader migrates. It's possible that mm is not |
591 | * set, if so charge the init_mm (happens for pagecache usage). | 495 | * set, if so charge the init_mm (happens for pagecache usage). |
592 | */ | 496 | */ |
497 | |||
593 | if (likely(!memcg)) { | 498 | if (likely(!memcg)) { |
594 | rcu_read_lock(); | 499 | rcu_read_lock(); |
595 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 500 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
596 | if (unlikely(!mem)) { | 501 | if (unlikely(!mem)) { |
597 | rcu_read_unlock(); | 502 | rcu_read_unlock(); |
598 | kmem_cache_free(page_cgroup_cache, pc); | ||
599 | return 0; | 503 | return 0; |
600 | } | 504 | } |
601 | /* | 505 | /* |
@@ -631,36 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
631 | } | 535 | } |
632 | } | 536 | } |
633 | 537 | ||
538 | |||
539 | lock_page_cgroup(pc); | ||
540 | if (unlikely(PageCgroupUsed(pc))) { | ||
541 | unlock_page_cgroup(pc); | ||
542 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
543 | css_put(&mem->css); | ||
544 | |||
545 | goto done; | ||
546 | } | ||
634 | pc->mem_cgroup = mem; | 547 | pc->mem_cgroup = mem; |
635 | pc->page = page; | ||
636 | /* | 548 | /* |
637 | * If a page is accounted as a page cache, insert to inactive list. | 549 | * If a page is accounted as a page cache, insert to inactive list. |
638 | * If anon, insert to active list. | 550 | * If anon, insert to active list. |
639 | */ | 551 | */ |
640 | pc->flags = pcg_default_flags[ctype]; | 552 | pc->flags = pcg_default_flags[ctype]; |
641 | 553 | ||
642 | lock_page_cgroup(page); | ||
643 | if (unlikely(page_get_page_cgroup(page))) { | ||
644 | unlock_page_cgroup(page); | ||
645 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
646 | css_put(&mem->css); | ||
647 | kmem_cache_free(page_cgroup_cache, pc); | ||
648 | goto done; | ||
649 | } | ||
650 | page_assign_page_cgroup(page, pc); | ||
651 | |||
652 | mz = page_cgroup_zoneinfo(pc); | 554 | mz = page_cgroup_zoneinfo(pc); |
555 | |||
653 | spin_lock_irqsave(&mz->lru_lock, flags); | 556 | spin_lock_irqsave(&mz->lru_lock, flags); |
654 | __mem_cgroup_add_list(mz, pc); | 557 | __mem_cgroup_add_list(mz, pc); |
655 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 558 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
559 | unlock_page_cgroup(pc); | ||
656 | 560 | ||
657 | unlock_page_cgroup(page); | ||
658 | done: | 561 | done: |
659 | return 0; | 562 | return 0; |
660 | out: | 563 | out: |
661 | css_put(&mem->css); | 564 | css_put(&mem->css); |
662 | kmem_cache_free(page_cgroup_cache, pc); | ||
663 | err: | ||
664 | return -ENOMEM; | 565 | return -ENOMEM; |
665 | } | 566 | } |
666 | 567 | ||
@@ -668,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | |||
668 | { | 569 | { |
669 | if (mem_cgroup_subsys.disabled) | 570 | if (mem_cgroup_subsys.disabled) |
670 | return 0; | 571 | return 0; |
671 | 572 | if (PageCompound(page)) | |
573 | return 0; | ||
672 | /* | 574 | /* |
673 | * If already mapped, we don't have to account. | 575 | * If already mapped, we don't have to account. |
674 | * If page cache, page->mapping has address_space. | 576 | * If page cache, page->mapping has address_space. |
@@ -689,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
689 | { | 591 | { |
690 | if (mem_cgroup_subsys.disabled) | 592 | if (mem_cgroup_subsys.disabled) |
691 | return 0; | 593 | return 0; |
692 | 594 | if (PageCompound(page)) | |
595 | return 0; | ||
693 | /* | 596 | /* |
694 | * Corner case handling. This is called from add_to_page_cache() | 597 | * Corner case handling. This is called from add_to_page_cache() |
695 | * in usual. But some FS (shmem) precharges this page before calling it | 598 | * in usual. But some FS (shmem) precharges this page before calling it |
@@ -702,15 +605,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
702 | if (!(gfp_mask & __GFP_WAIT)) { | 605 | if (!(gfp_mask & __GFP_WAIT)) { |
703 | struct page_cgroup *pc; | 606 | struct page_cgroup *pc; |
704 | 607 | ||
705 | lock_page_cgroup(page); | 608 | |
706 | pc = page_get_page_cgroup(page); | 609 | pc = lookup_page_cgroup(page); |
707 | if (pc) { | 610 | if (!pc) |
708 | VM_BUG_ON(pc->page != page); | 611 | return 0; |
709 | VM_BUG_ON(!pc->mem_cgroup); | 612 | lock_page_cgroup(pc); |
710 | unlock_page_cgroup(page); | 613 | if (PageCgroupUsed(pc)) { |
614 | unlock_page_cgroup(pc); | ||
711 | return 0; | 615 | return 0; |
712 | } | 616 | } |
713 | unlock_page_cgroup(page); | 617 | unlock_page_cgroup(pc); |
714 | } | 618 | } |
715 | 619 | ||
716 | if (unlikely(!mm)) | 620 | if (unlikely(!mm)) |
@@ -741,37 +645,39 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
741 | /* | 645 | /* |
742 | * Check if our page_cgroup is valid | 646 | * Check if our page_cgroup is valid |
743 | */ | 647 | */ |
744 | lock_page_cgroup(page); | 648 | pc = lookup_page_cgroup(page); |
745 | pc = page_get_page_cgroup(page); | 649 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
746 | if (unlikely(!pc)) | 650 | return; |
747 | goto unlock; | ||
748 | |||
749 | VM_BUG_ON(pc->page != page); | ||
750 | 651 | ||
751 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 652 | lock_page_cgroup(pc); |
752 | && ((PageCgroupCache(pc) || page_mapped(page)))) | 653 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) |
753 | goto unlock; | 654 | || !PageCgroupUsed(pc)) { |
655 | /* This happens at race in zap_pte_range() and do_swap_page()*/ | ||
656 | unlock_page_cgroup(pc); | ||
657 | return; | ||
658 | } | ||
659 | ClearPageCgroupUsed(pc); | ||
660 | mem = pc->mem_cgroup; | ||
754 | 661 | ||
755 | mz = page_cgroup_zoneinfo(pc); | 662 | mz = page_cgroup_zoneinfo(pc); |
756 | spin_lock_irqsave(&mz->lru_lock, flags); | 663 | spin_lock_irqsave(&mz->lru_lock, flags); |
757 | __mem_cgroup_remove_list(mz, pc); | 664 | __mem_cgroup_remove_list(mz, pc); |
758 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 665 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
666 | unlock_page_cgroup(pc); | ||
759 | 667 | ||
760 | page_assign_page_cgroup(page, NULL); | ||
761 | unlock_page_cgroup(page); | ||
762 | |||
763 | mem = pc->mem_cgroup; | ||
764 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 668 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
765 | css_put(&mem->css); | 669 | css_put(&mem->css); |
766 | 670 | ||
767 | kmem_cache_free(page_cgroup_cache, pc); | ||
768 | return; | 671 | return; |
769 | unlock: | ||
770 | unlock_page_cgroup(page); | ||
771 | } | 672 | } |
772 | 673 | ||
773 | void mem_cgroup_uncharge_page(struct page *page) | 674 | void mem_cgroup_uncharge_page(struct page *page) |
774 | { | 675 | { |
676 | /* early check. */ | ||
677 | if (page_mapped(page)) | ||
678 | return; | ||
679 | if (page->mapping && !PageAnon(page)) | ||
680 | return; | ||
775 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 681 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
776 | } | 682 | } |
777 | 683 | ||
@@ -795,9 +701,9 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | |||
795 | if (mem_cgroup_subsys.disabled) | 701 | if (mem_cgroup_subsys.disabled) |
796 | return 0; | 702 | return 0; |
797 | 703 | ||
798 | lock_page_cgroup(page); | 704 | pc = lookup_page_cgroup(page); |
799 | pc = page_get_page_cgroup(page); | 705 | lock_page_cgroup(pc); |
800 | if (pc) { | 706 | if (PageCgroupUsed(pc)) { |
801 | mem = pc->mem_cgroup; | 707 | mem = pc->mem_cgroup; |
802 | css_get(&mem->css); | 708 | css_get(&mem->css); |
803 | if (PageCgroupCache(pc)) { | 709 | if (PageCgroupCache(pc)) { |
@@ -807,7 +713,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | |||
807 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 713 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
808 | } | 714 | } |
809 | } | 715 | } |
810 | unlock_page_cgroup(page); | 716 | unlock_page_cgroup(pc); |
811 | if (mem) { | 717 | if (mem) { |
812 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | 718 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, |
813 | ctype, mem); | 719 | ctype, mem); |
@@ -832,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage) | |||
832 | */ | 738 | */ |
833 | if (!newpage->mapping) | 739 | if (!newpage->mapping) |
834 | __mem_cgroup_uncharge_common(newpage, | 740 | __mem_cgroup_uncharge_common(newpage, |
835 | MEM_CGROUP_CHARGE_TYPE_FORCE); | 741 | MEM_CGROUP_CHARGE_TYPE_FORCE); |
836 | else if (PageAnon(newpage)) | 742 | else if (PageAnon(newpage)) |
837 | mem_cgroup_uncharge_page(newpage); | 743 | mem_cgroup_uncharge_page(newpage); |
838 | } | 744 | } |
@@ -918,6 +824,8 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
918 | while (!list_empty(list)) { | 824 | while (!list_empty(list)) { |
919 | pc = list_entry(list->prev, struct page_cgroup, lru); | 825 | pc = list_entry(list->prev, struct page_cgroup, lru); |
920 | page = pc->page; | 826 | page = pc->page; |
827 | if (!PageCgroupUsed(pc)) | ||
828 | break; | ||
921 | get_page(page); | 829 | get_page(page); |
922 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 830 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
923 | /* | 831 | /* |
@@ -932,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
932 | count = FORCE_UNCHARGE_BATCH; | 840 | count = FORCE_UNCHARGE_BATCH; |
933 | cond_resched(); | 841 | cond_resched(); |
934 | } | 842 | } |
935 | } else | 843 | } else { |
936 | cond_resched(); | 844 | spin_lock_irqsave(&mz->lru_lock, flags); |
845 | break; | ||
846 | } | ||
937 | spin_lock_irqsave(&mz->lru_lock, flags); | 847 | spin_lock_irqsave(&mz->lru_lock, flags); |
938 | } | 848 | } |
939 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 849 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
@@ -957,6 +867,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
957 | while (mem->res.usage > 0) { | 867 | while (mem->res.usage > 0) { |
958 | if (atomic_read(&mem->css.cgroup->count) > 0) | 868 | if (atomic_read(&mem->css.cgroup->count) > 0) |
959 | goto out; | 869 | goto out; |
870 | /* This is for making all *used* pages to be on LRU. */ | ||
871 | lru_add_drain_all(); | ||
960 | for_each_node_state(node, N_POSSIBLE) | 872 | for_each_node_state(node, N_POSSIBLE) |
961 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 873 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
962 | struct mem_cgroup_per_zone *mz; | 874 | struct mem_cgroup_per_zone *mz; |
@@ -965,6 +877,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
965 | for_each_lru(l) | 877 | for_each_lru(l) |
966 | mem_cgroup_force_empty_list(mem, mz, l); | 878 | mem_cgroup_force_empty_list(mem, mz, l); |
967 | } | 879 | } |
880 | cond_resched(); | ||
968 | } | 881 | } |
969 | ret = 0; | 882 | ret = 0; |
970 | out: | 883 | out: |
@@ -1175,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1175 | int node; | 1088 | int node; |
1176 | 1089 | ||
1177 | if (unlikely((cont->parent) == NULL)) { | 1090 | if (unlikely((cont->parent) == NULL)) { |
1091 | page_cgroup_init(); | ||
1178 | mem = &init_mem_cgroup; | 1092 | mem = &init_mem_cgroup; |
1179 | page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); | ||
1180 | } else { | 1093 | } else { |
1181 | mem = mem_cgroup_alloc(); | 1094 | mem = mem_cgroup_alloc(); |
1182 | if (!mem) | 1095 | if (!mem) |