diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2008-10-18 23:28:16 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-10-20 11:52:39 -0400 |
commit | 52d4b9ac0b985168009c2a57098324e67bae171f (patch) | |
tree | b3e3b854166930af893be90ea30a7ab0d65c59e7 /mm | |
parent | c05555b572921c464d064d9267f7f7bc06d424fa (diff) |
memcg: allocate all page_cgroup at boot
Allocate all page_cgroup at boot and remove the page_cgroup pointer from
struct page. This patch adds the following interface:
struct page_cgroup *lookup_page_cgroup(struct page*)
FLATMEM, DISCONTIGMEM, SPARSEMEM and MEMORY_HOTPLUG are all supported.
Removing the page_cgroup pointer from struct page reduces memory usage by
- 4 bytes per PAGE_SIZE (32-bit), or
- 8 bytes per PAGE_SIZE (64-bit)
when the memory controller is disabled (even if it is configured).
On a typical 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
On my x86-64 server with 48GB of memory, this saves 96MB of memory.
I think this reduction makes sense.
With pre-allocation, the kmalloc/kfree calls in charge/uncharge are removed.
This means
- we no longer need to worry about kmalloc failure.
(this can happen depending on the gfp_mask type.)
- we can avoid calling kmalloc/kfree.
- we can avoid allocating tons of small objects which can be fragmented.
- we can know what amount of memory will be used for this extra-lru handling.
I added the following printk messages:
"allocated %ld bytes of page_cgroup"
"please try cgroup_disable=memory option if you don't want"
which should be informative enough for users.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/memcontrol.c | 247 | ||||
-rw-r--r-- | mm/page_alloc.c | 12 | ||||
-rw-r--r-- | mm/page_cgroup.c | 237 |
4 files changed, 321 insertions, 178 deletions
diff --git a/mm/Makefile b/mm/Makefile index da4ccf015aea..c06b45a1ff5f 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
33 | obj-$(CONFIG_MIGRATION) += migrate.o | 33 | obj-$(CONFIG_MIGRATION) += migrate.o |
34 | obj-$(CONFIG_SMP) += allocpercpu.o | 34 | obj-$(CONFIG_SMP) += allocpercpu.o |
35 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 35 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
36 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o | 36 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
37 | |||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 031682e7ef0c..d4a92b63e98e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -33,11 +33,11 @@ | |||
33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/mm_inline.h> | 35 | #include <linux/mm_inline.h> |
36 | #include <linux/page_cgroup.h> | ||
36 | 37 | ||
37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
38 | 39 | ||
39 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
40 | static struct kmem_cache *page_cgroup_cache __read_mostly; | ||
41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
42 | 42 | ||
43 | /* | 43 | /* |
@@ -135,79 +135,6 @@ struct mem_cgroup { | |||
135 | }; | 135 | }; |
136 | static struct mem_cgroup init_mem_cgroup; | 136 | static struct mem_cgroup init_mem_cgroup; |
137 | 137 | ||
138 | /* | ||
139 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | ||
140 | * lock. We need to ensure that page->page_cgroup is at least two | ||
141 | * byte aligned (based on comments from Nick Piggin). But since | ||
142 | * bit_spin_lock doesn't actually set that lock bit in a non-debug | ||
143 | * uniprocessor kernel, we should avoid setting it here too. | ||
144 | */ | ||
145 | #define PAGE_CGROUP_LOCK_BIT 0x0 | ||
146 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | ||
147 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
148 | #else | ||
149 | #define PAGE_CGROUP_LOCK 0x0 | ||
150 | #endif | ||
151 | |||
152 | /* | ||
153 | * A page_cgroup page is associated with every page descriptor. The | ||
154 | * page_cgroup helps us identify information about the cgroup | ||
155 | */ | ||
156 | struct page_cgroup { | ||
157 | struct list_head lru; /* per cgroup LRU list */ | ||
158 | struct page *page; | ||
159 | struct mem_cgroup *mem_cgroup; | ||
160 | unsigned long flags; | ||
161 | }; | ||
162 | |||
163 | enum { | ||
164 | /* flags for mem_cgroup */ | ||
165 | PCG_CACHE, /* charged as cache */ | ||
166 | /* flags for LRU placement */ | ||
167 | PCG_ACTIVE, /* page is active in this cgroup */ | ||
168 | PCG_FILE, /* page is file system backed */ | ||
169 | PCG_UNEVICTABLE, /* page is unevictableable */ | ||
170 | }; | ||
171 | |||
172 | #define TESTPCGFLAG(uname, lname) \ | ||
173 | static inline int PageCgroup##uname(struct page_cgroup *pc) \ | ||
174 | { return test_bit(PCG_##lname, &pc->flags); } | ||
175 | |||
176 | #define SETPCGFLAG(uname, lname) \ | ||
177 | static inline void SetPageCgroup##uname(struct page_cgroup *pc)\ | ||
178 | { set_bit(PCG_##lname, &pc->flags); } | ||
179 | |||
180 | #define CLEARPCGFLAG(uname, lname) \ | ||
181 | static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ | ||
182 | { clear_bit(PCG_##lname, &pc->flags); } | ||
183 | |||
184 | |||
185 | /* Cache flag is set only once (at allocation) */ | ||
186 | TESTPCGFLAG(Cache, CACHE) | ||
187 | |||
188 | /* LRU management flags (from global-lru definition) */ | ||
189 | TESTPCGFLAG(File, FILE) | ||
190 | SETPCGFLAG(File, FILE) | ||
191 | CLEARPCGFLAG(File, FILE) | ||
192 | |||
193 | TESTPCGFLAG(Active, ACTIVE) | ||
194 | SETPCGFLAG(Active, ACTIVE) | ||
195 | CLEARPCGFLAG(Active, ACTIVE) | ||
196 | |||
197 | TESTPCGFLAG(Unevictable, UNEVICTABLE) | ||
198 | SETPCGFLAG(Unevictable, UNEVICTABLE) | ||
199 | CLEARPCGFLAG(Unevictable, UNEVICTABLE) | ||
200 | |||
201 | static int page_cgroup_nid(struct page_cgroup *pc) | ||
202 | { | ||
203 | return page_to_nid(pc->page); | ||
204 | } | ||
205 | |||
206 | static enum zone_type page_cgroup_zid(struct page_cgroup *pc) | ||
207 | { | ||
208 | return page_zonenum(pc->page); | ||
209 | } | ||
210 | |||
211 | enum charge_type { | 138 | enum charge_type { |
212 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
213 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
@@ -216,12 +143,18 @@ enum charge_type { | |||
216 | NR_CHARGE_TYPE, | 143 | NR_CHARGE_TYPE, |
217 | }; | 144 | }; |
218 | 145 | ||
146 | /* only for here (for easy reading.) */ | ||
147 | #define PCGF_CACHE (1UL << PCG_CACHE) | ||
148 | #define PCGF_USED (1UL << PCG_USED) | ||
149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | ||
150 | #define PCGF_LOCK (1UL << PCG_LOCK) | ||
151 | #define PCGF_FILE (1UL << PCG_FILE) | ||
219 | static const unsigned long | 152 | static const unsigned long |
220 | pcg_default_flags[NR_CHARGE_TYPE] = { | 153 | pcg_default_flags[NR_CHARGE_TYPE] = { |
221 | ((1 << PCG_CACHE) | (1 << PCG_FILE)), | 154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ |
222 | ((1 << PCG_ACTIVE)), | 155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ |
223 | ((1 << PCG_ACTIVE) | (1 << PCG_CACHE)), | 156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ |
224 | 0, | 157 | 0, /* FORCE */ |
225 | }; | 158 | }; |
226 | 159 | ||
227 | /* | 160 | /* |
@@ -303,37 +236,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
303 | struct mem_cgroup, css); | 236 | struct mem_cgroup, css); |
304 | } | 237 | } |
305 | 238 | ||
306 | static inline int page_cgroup_locked(struct page *page) | ||
307 | { | ||
308 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
309 | } | ||
310 | |||
311 | static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | ||
312 | { | ||
313 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
314 | page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); | ||
315 | } | ||
316 | |||
317 | struct page_cgroup *page_get_page_cgroup(struct page *page) | ||
318 | { | ||
319 | return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
320 | } | ||
321 | |||
322 | static void lock_page_cgroup(struct page *page) | ||
323 | { | ||
324 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
325 | } | ||
326 | |||
327 | static int try_lock_page_cgroup(struct page *page) | ||
328 | { | ||
329 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
330 | } | ||
331 | |||
332 | static void unlock_page_cgroup(struct page *page) | ||
333 | { | ||
334 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
335 | } | ||
336 | |||
337 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, |
338 | struct page_cgroup *pc) | 240 | struct page_cgroup *pc) |
339 | { | 241 | { |
@@ -436,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | |||
436 | * safely get to page_cgroup without it, so just try_lock it: | 338 | * safely get to page_cgroup without it, so just try_lock it: |
437 | * mem_cgroup_isolate_pages allows for page left on wrong list. | 339 | * mem_cgroup_isolate_pages allows for page left on wrong list. |
438 | */ | 340 | */ |
439 | if (!try_lock_page_cgroup(page)) | 341 | pc = lookup_page_cgroup(page); |
342 | if (!trylock_page_cgroup(pc)) | ||
440 | return; | 343 | return; |
441 | 344 | if (pc && PageCgroupUsed(pc)) { | |
442 | pc = page_get_page_cgroup(page); | ||
443 | if (pc) { | ||
444 | mz = page_cgroup_zoneinfo(pc); | 345 | mz = page_cgroup_zoneinfo(pc); |
445 | spin_lock_irqsave(&mz->lru_lock, flags); | 346 | spin_lock_irqsave(&mz->lru_lock, flags); |
446 | __mem_cgroup_move_lists(pc, lru); | 347 | __mem_cgroup_move_lists(pc, lru); |
447 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 348 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
448 | } | 349 | } |
449 | unlock_page_cgroup(page); | 350 | unlock_page_cgroup(pc); |
450 | } | 351 | } |
451 | 352 | ||
452 | /* | 353 | /* |
@@ -533,6 +434,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
533 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 434 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
534 | if (scan >= nr_to_scan) | 435 | if (scan >= nr_to_scan) |
535 | break; | 436 | break; |
437 | if (unlikely(!PageCgroupUsed(pc))) | ||
438 | continue; | ||
536 | page = pc->page; | 439 | page = pc->page; |
537 | 440 | ||
538 | if (unlikely(!PageLRU(page))) | 441 | if (unlikely(!PageLRU(page))) |
@@ -576,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
576 | { | 479 | { |
577 | struct mem_cgroup *mem; | 480 | struct mem_cgroup *mem; |
578 | struct page_cgroup *pc; | 481 | struct page_cgroup *pc; |
579 | unsigned long flags; | ||
580 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 482 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
581 | struct mem_cgroup_per_zone *mz; | 483 | struct mem_cgroup_per_zone *mz; |
484 | unsigned long flags; | ||
582 | 485 | ||
583 | pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); | 486 | pc = lookup_page_cgroup(page); |
584 | if (unlikely(pc == NULL)) | 487 | /* can happen at boot */ |
585 | goto err; | 488 | if (unlikely(!pc)) |
586 | 489 | return 0; | |
490 | prefetchw(pc); | ||
587 | /* | 491 | /* |
588 | * We always charge the cgroup the mm_struct belongs to. | 492 | * We always charge the cgroup the mm_struct belongs to. |
589 | * The mm_struct's mem_cgroup changes on task migration if the | 493 | * The mm_struct's mem_cgroup changes on task migration if the |
590 | * thread group leader migrates. It's possible that mm is not | 494 | * thread group leader migrates. It's possible that mm is not |
591 | * set, if so charge the init_mm (happens for pagecache usage). | 495 | * set, if so charge the init_mm (happens for pagecache usage). |
592 | */ | 496 | */ |
497 | |||
593 | if (likely(!memcg)) { | 498 | if (likely(!memcg)) { |
594 | rcu_read_lock(); | 499 | rcu_read_lock(); |
595 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 500 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
596 | if (unlikely(!mem)) { | 501 | if (unlikely(!mem)) { |
597 | rcu_read_unlock(); | 502 | rcu_read_unlock(); |
598 | kmem_cache_free(page_cgroup_cache, pc); | ||
599 | return 0; | 503 | return 0; |
600 | } | 504 | } |
601 | /* | 505 | /* |
@@ -631,36 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
631 | } | 535 | } |
632 | } | 536 | } |
633 | 537 | ||
538 | |||
539 | lock_page_cgroup(pc); | ||
540 | if (unlikely(PageCgroupUsed(pc))) { | ||
541 | unlock_page_cgroup(pc); | ||
542 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
543 | css_put(&mem->css); | ||
544 | |||
545 | goto done; | ||
546 | } | ||
634 | pc->mem_cgroup = mem; | 547 | pc->mem_cgroup = mem; |
635 | pc->page = page; | ||
636 | /* | 548 | /* |
637 | * If a page is accounted as a page cache, insert to inactive list. | 549 | * If a page is accounted as a page cache, insert to inactive list. |
638 | * If anon, insert to active list. | 550 | * If anon, insert to active list. |
639 | */ | 551 | */ |
640 | pc->flags = pcg_default_flags[ctype]; | 552 | pc->flags = pcg_default_flags[ctype]; |
641 | 553 | ||
642 | lock_page_cgroup(page); | ||
643 | if (unlikely(page_get_page_cgroup(page))) { | ||
644 | unlock_page_cgroup(page); | ||
645 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
646 | css_put(&mem->css); | ||
647 | kmem_cache_free(page_cgroup_cache, pc); | ||
648 | goto done; | ||
649 | } | ||
650 | page_assign_page_cgroup(page, pc); | ||
651 | |||
652 | mz = page_cgroup_zoneinfo(pc); | 554 | mz = page_cgroup_zoneinfo(pc); |
555 | |||
653 | spin_lock_irqsave(&mz->lru_lock, flags); | 556 | spin_lock_irqsave(&mz->lru_lock, flags); |
654 | __mem_cgroup_add_list(mz, pc); | 557 | __mem_cgroup_add_list(mz, pc); |
655 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 558 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
559 | unlock_page_cgroup(pc); | ||
656 | 560 | ||
657 | unlock_page_cgroup(page); | ||
658 | done: | 561 | done: |
659 | return 0; | 562 | return 0; |
660 | out: | 563 | out: |
661 | css_put(&mem->css); | 564 | css_put(&mem->css); |
662 | kmem_cache_free(page_cgroup_cache, pc); | ||
663 | err: | ||
664 | return -ENOMEM; | 565 | return -ENOMEM; |
665 | } | 566 | } |
666 | 567 | ||
@@ -668,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | |||
668 | { | 569 | { |
669 | if (mem_cgroup_subsys.disabled) | 570 | if (mem_cgroup_subsys.disabled) |
670 | return 0; | 571 | return 0; |
671 | 572 | if (PageCompound(page)) | |
573 | return 0; | ||
672 | /* | 574 | /* |
673 | * If already mapped, we don't have to account. | 575 | * If already mapped, we don't have to account. |
674 | * If page cache, page->mapping has address_space. | 576 | * If page cache, page->mapping has address_space. |
@@ -689,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
689 | { | 591 | { |
690 | if (mem_cgroup_subsys.disabled) | 592 | if (mem_cgroup_subsys.disabled) |
691 | return 0; | 593 | return 0; |
692 | 594 | if (PageCompound(page)) | |
595 | return 0; | ||
693 | /* | 596 | /* |
694 | * Corner case handling. This is called from add_to_page_cache() | 597 | * Corner case handling. This is called from add_to_page_cache() |
695 | * in usual. But some FS (shmem) precharges this page before calling it | 598 | * in usual. But some FS (shmem) precharges this page before calling it |
@@ -702,15 +605,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
702 | if (!(gfp_mask & __GFP_WAIT)) { | 605 | if (!(gfp_mask & __GFP_WAIT)) { |
703 | struct page_cgroup *pc; | 606 | struct page_cgroup *pc; |
704 | 607 | ||
705 | lock_page_cgroup(page); | 608 | |
706 | pc = page_get_page_cgroup(page); | 609 | pc = lookup_page_cgroup(page); |
707 | if (pc) { | 610 | if (!pc) |
708 | VM_BUG_ON(pc->page != page); | 611 | return 0; |
709 | VM_BUG_ON(!pc->mem_cgroup); | 612 | lock_page_cgroup(pc); |
710 | unlock_page_cgroup(page); | 613 | if (PageCgroupUsed(pc)) { |
614 | unlock_page_cgroup(pc); | ||
711 | return 0; | 615 | return 0; |
712 | } | 616 | } |
713 | unlock_page_cgroup(page); | 617 | unlock_page_cgroup(pc); |
714 | } | 618 | } |
715 | 619 | ||
716 | if (unlikely(!mm)) | 620 | if (unlikely(!mm)) |
@@ -741,37 +645,39 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
741 | /* | 645 | /* |
742 | * Check if our page_cgroup is valid | 646 | * Check if our page_cgroup is valid |
743 | */ | 647 | */ |
744 | lock_page_cgroup(page); | 648 | pc = lookup_page_cgroup(page); |
745 | pc = page_get_page_cgroup(page); | 649 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
746 | if (unlikely(!pc)) | 650 | return; |
747 | goto unlock; | ||
748 | |||
749 | VM_BUG_ON(pc->page != page); | ||
750 | 651 | ||
751 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 652 | lock_page_cgroup(pc); |
752 | && ((PageCgroupCache(pc) || page_mapped(page)))) | 653 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) |
753 | goto unlock; | 654 | || !PageCgroupUsed(pc)) { |
655 | /* This happens at race in zap_pte_range() and do_swap_page()*/ | ||
656 | unlock_page_cgroup(pc); | ||
657 | return; | ||
658 | } | ||
659 | ClearPageCgroupUsed(pc); | ||
660 | mem = pc->mem_cgroup; | ||
754 | 661 | ||
755 | mz = page_cgroup_zoneinfo(pc); | 662 | mz = page_cgroup_zoneinfo(pc); |
756 | spin_lock_irqsave(&mz->lru_lock, flags); | 663 | spin_lock_irqsave(&mz->lru_lock, flags); |
757 | __mem_cgroup_remove_list(mz, pc); | 664 | __mem_cgroup_remove_list(mz, pc); |
758 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 665 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
666 | unlock_page_cgroup(pc); | ||
759 | 667 | ||
760 | page_assign_page_cgroup(page, NULL); | ||
761 | unlock_page_cgroup(page); | ||
762 | |||
763 | mem = pc->mem_cgroup; | ||
764 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 668 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
765 | css_put(&mem->css); | 669 | css_put(&mem->css); |
766 | 670 | ||
767 | kmem_cache_free(page_cgroup_cache, pc); | ||
768 | return; | 671 | return; |
769 | unlock: | ||
770 | unlock_page_cgroup(page); | ||
771 | } | 672 | } |
772 | 673 | ||
773 | void mem_cgroup_uncharge_page(struct page *page) | 674 | void mem_cgroup_uncharge_page(struct page *page) |
774 | { | 675 | { |
676 | /* early check. */ | ||
677 | if (page_mapped(page)) | ||
678 | return; | ||
679 | if (page->mapping && !PageAnon(page)) | ||
680 | return; | ||
775 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 681 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
776 | } | 682 | } |
777 | 683 | ||
@@ -795,9 +701,9 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | |||
795 | if (mem_cgroup_subsys.disabled) | 701 | if (mem_cgroup_subsys.disabled) |
796 | return 0; | 702 | return 0; |
797 | 703 | ||
798 | lock_page_cgroup(page); | 704 | pc = lookup_page_cgroup(page); |
799 | pc = page_get_page_cgroup(page); | 705 | lock_page_cgroup(pc); |
800 | if (pc) { | 706 | if (PageCgroupUsed(pc)) { |
801 | mem = pc->mem_cgroup; | 707 | mem = pc->mem_cgroup; |
802 | css_get(&mem->css); | 708 | css_get(&mem->css); |
803 | if (PageCgroupCache(pc)) { | 709 | if (PageCgroupCache(pc)) { |
@@ -807,7 +713,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | |||
807 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 713 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
808 | } | 714 | } |
809 | } | 715 | } |
810 | unlock_page_cgroup(page); | 716 | unlock_page_cgroup(pc); |
811 | if (mem) { | 717 | if (mem) { |
812 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | 718 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, |
813 | ctype, mem); | 719 | ctype, mem); |
@@ -832,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage) | |||
832 | */ | 738 | */ |
833 | if (!newpage->mapping) | 739 | if (!newpage->mapping) |
834 | __mem_cgroup_uncharge_common(newpage, | 740 | __mem_cgroup_uncharge_common(newpage, |
835 | MEM_CGROUP_CHARGE_TYPE_FORCE); | 741 | MEM_CGROUP_CHARGE_TYPE_FORCE); |
836 | else if (PageAnon(newpage)) | 742 | else if (PageAnon(newpage)) |
837 | mem_cgroup_uncharge_page(newpage); | 743 | mem_cgroup_uncharge_page(newpage); |
838 | } | 744 | } |
@@ -918,6 +824,8 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
918 | while (!list_empty(list)) { | 824 | while (!list_empty(list)) { |
919 | pc = list_entry(list->prev, struct page_cgroup, lru); | 825 | pc = list_entry(list->prev, struct page_cgroup, lru); |
920 | page = pc->page; | 826 | page = pc->page; |
827 | if (!PageCgroupUsed(pc)) | ||
828 | break; | ||
921 | get_page(page); | 829 | get_page(page); |
922 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 830 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
923 | /* | 831 | /* |
@@ -932,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
932 | count = FORCE_UNCHARGE_BATCH; | 840 | count = FORCE_UNCHARGE_BATCH; |
933 | cond_resched(); | 841 | cond_resched(); |
934 | } | 842 | } |
935 | } else | 843 | } else { |
936 | cond_resched(); | 844 | spin_lock_irqsave(&mz->lru_lock, flags); |
845 | break; | ||
846 | } | ||
937 | spin_lock_irqsave(&mz->lru_lock, flags); | 847 | spin_lock_irqsave(&mz->lru_lock, flags); |
938 | } | 848 | } |
939 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 849 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
@@ -957,6 +867,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
957 | while (mem->res.usage > 0) { | 867 | while (mem->res.usage > 0) { |
958 | if (atomic_read(&mem->css.cgroup->count) > 0) | 868 | if (atomic_read(&mem->css.cgroup->count) > 0) |
959 | goto out; | 869 | goto out; |
870 | /* This is for making all *used* pages to be on LRU. */ | ||
871 | lru_add_drain_all(); | ||
960 | for_each_node_state(node, N_POSSIBLE) | 872 | for_each_node_state(node, N_POSSIBLE) |
961 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 873 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
962 | struct mem_cgroup_per_zone *mz; | 874 | struct mem_cgroup_per_zone *mz; |
@@ -965,6 +877,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
965 | for_each_lru(l) | 877 | for_each_lru(l) |
966 | mem_cgroup_force_empty_list(mem, mz, l); | 878 | mem_cgroup_force_empty_list(mem, mz, l); |
967 | } | 879 | } |
880 | cond_resched(); | ||
968 | } | 881 | } |
969 | ret = 0; | 882 | ret = 0; |
970 | out: | 883 | out: |
@@ -1175,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1175 | int node; | 1088 | int node; |
1176 | 1089 | ||
1177 | if (unlikely((cont->parent) == NULL)) { | 1090 | if (unlikely((cont->parent) == NULL)) { |
1091 | page_cgroup_init(); | ||
1178 | mem = &init_mem_cgroup; | 1092 | mem = &init_mem_cgroup; |
1179 | page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); | ||
1180 | } else { | 1093 | } else { |
1181 | mem = mem_cgroup_alloc(); | 1094 | mem = mem_cgroup_alloc(); |
1182 | if (!mem) | 1095 | if (!mem) |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2fc44ec1d44..d0a240fbb8bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -44,7 +44,7 @@ | |||
44 | #include <linux/backing-dev.h> | 44 | #include <linux/backing-dev.h> |
45 | #include <linux/fault-inject.h> | 45 | #include <linux/fault-inject.h> |
46 | #include <linux/page-isolation.h> | 46 | #include <linux/page-isolation.h> |
47 | #include <linux/memcontrol.h> | 47 | #include <linux/page_cgroup.h> |
48 | #include <linux/debugobjects.h> | 48 | #include <linux/debugobjects.h> |
49 | 49 | ||
50 | #include <asm/tlbflush.h> | 50 | #include <asm/tlbflush.h> |
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
223 | 223 | ||
224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
225 | { | 225 | { |
226 | void *pc = page_get_page_cgroup(page); | ||
227 | |||
228 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 226 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG |
229 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 227 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", |
230 | current->comm, page, (int)(2*sizeof(unsigned long)), | 228 | current->comm, page, (int)(2*sizeof(unsigned long)), |
231 | (unsigned long)page->flags, page->mapping, | 229 | (unsigned long)page->flags, page->mapping, |
232 | page_mapcount(page), page_count(page)); | 230 | page_mapcount(page), page_count(page)); |
233 | if (pc) { | 231 | |
234 | printk(KERN_EMERG "cgroup:%p\n", pc); | ||
235 | page_reset_bad_cgroup(page); | ||
236 | } | ||
237 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | 232 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" |
238 | KERN_EMERG "Backtrace:\n"); | 233 | KERN_EMERG "Backtrace:\n"); |
239 | dump_stack(); | 234 | dump_stack(); |
@@ -457,7 +452,6 @@ static inline int free_pages_check(struct page *page) | |||
457 | free_page_mlock(page); | 452 | free_page_mlock(page); |
458 | if (unlikely(page_mapcount(page) | | 453 | if (unlikely(page_mapcount(page) | |
459 | (page->mapping != NULL) | | 454 | (page->mapping != NULL) | |
460 | (page_get_page_cgroup(page) != NULL) | | ||
461 | (page_count(page) != 0) | | 455 | (page_count(page) != 0) | |
462 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 456 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) |
463 | bad_page(page); | 457 | bad_page(page); |
@@ -603,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
603 | { | 597 | { |
604 | if (unlikely(page_mapcount(page) | | 598 | if (unlikely(page_mapcount(page) | |
605 | (page->mapping != NULL) | | 599 | (page->mapping != NULL) | |
606 | (page_get_page_cgroup(page) != NULL) | | ||
607 | (page_count(page) != 0) | | 600 | (page_count(page) != 0) | |
608 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 601 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) |
609 | bad_page(page); | 602 | bad_page(page); |
@@ -3438,6 +3431,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3438 | pgdat->nr_zones = 0; | 3431 | pgdat->nr_zones = 0; |
3439 | init_waitqueue_head(&pgdat->kswapd_wait); | 3432 | init_waitqueue_head(&pgdat->kswapd_wait); |
3440 | pgdat->kswapd_max_order = 0; | 3433 | pgdat->kswapd_max_order = 0; |
3434 | pgdat_page_cgroup_init(pgdat); | ||
3441 | 3435 | ||
3442 | for (j = 0; j < MAX_NR_ZONES; j++) { | 3436 | for (j = 0; j < MAX_NR_ZONES; j++) { |
3443 | struct zone *zone = pgdat->node_zones + j; | 3437 | struct zone *zone = pgdat->node_zones + j; |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c new file mode 100644 index 000000000000..5d86550701f2 --- /dev/null +++ b/mm/page_cgroup.c | |||
@@ -0,0 +1,237 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/mmzone.h> | ||
3 | #include <linux/bootmem.h> | ||
4 | #include <linux/bit_spinlock.h> | ||
5 | #include <linux/page_cgroup.h> | ||
6 | #include <linux/hash.h> | ||
7 | #include <linux/memory.h> | ||
8 | |||
9 | static void __meminit | ||
10 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | ||
11 | { | ||
12 | pc->flags = 0; | ||
13 | pc->mem_cgroup = NULL; | ||
14 | pc->page = pfn_to_page(pfn); | ||
15 | } | ||
16 | static unsigned long total_usage; | ||
17 | |||
18 | #if !defined(CONFIG_SPARSEMEM) | ||
19 | |||
20 | |||
21 | void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
22 | { | ||
23 | pgdat->node_page_cgroup = NULL; | ||
24 | } | ||
25 | |||
26 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
27 | { | ||
28 | unsigned long pfn = page_to_pfn(page); | ||
29 | unsigned long offset; | ||
30 | struct page_cgroup *base; | ||
31 | |||
32 | base = NODE_DATA(page_to_nid(page))->node_page_cgroup; | ||
33 | if (unlikely(!base)) | ||
34 | return NULL; | ||
35 | |||
36 | offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; | ||
37 | return base + offset; | ||
38 | } | ||
39 | |||
40 | static int __init alloc_node_page_cgroup(int nid) | ||
41 | { | ||
42 | struct page_cgroup *base, *pc; | ||
43 | unsigned long table_size; | ||
44 | unsigned long start_pfn, nr_pages, index; | ||
45 | |||
46 | start_pfn = NODE_DATA(nid)->node_start_pfn; | ||
47 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
48 | |||
49 | table_size = sizeof(struct page_cgroup) * nr_pages; | ||
50 | |||
51 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | ||
52 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
53 | if (!base) | ||
54 | return -ENOMEM; | ||
55 | for (index = 0; index < nr_pages; index++) { | ||
56 | pc = base + index; | ||
57 | __init_page_cgroup(pc, start_pfn + index); | ||
58 | } | ||
59 | NODE_DATA(nid)->node_page_cgroup = base; | ||
60 | total_usage += table_size; | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | void __init page_cgroup_init(void) | ||
65 | { | ||
66 | |||
67 | int nid, fail; | ||
68 | |||
69 | for_each_online_node(nid) { | ||
70 | fail = alloc_node_page_cgroup(nid); | ||
71 | if (fail) | ||
72 | goto fail; | ||
73 | } | ||
74 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
75 | printk(KERN_INFO "please try cgroup_disable=memory option if you" | ||
76 | " don't want\n"); | ||
77 | return; | ||
78 | fail: | ||
79 | printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); | ||
80 | printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); | ||
81 | panic("Out of memory"); | ||
82 | } | ||
83 | |||
84 | #else /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
85 | |||
86 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
87 | { | ||
88 | unsigned long pfn = page_to_pfn(page); | ||
89 | struct mem_section *section = __pfn_to_section(pfn); | ||
90 | |||
91 | return section->page_cgroup + pfn; | ||
92 | } | ||
93 | |||
94 | int __meminit init_section_page_cgroup(unsigned long pfn) | ||
95 | { | ||
96 | struct mem_section *section; | ||
97 | struct page_cgroup *base, *pc; | ||
98 | unsigned long table_size; | ||
99 | int nid, index; | ||
100 | |||
101 | section = __pfn_to_section(pfn); | ||
102 | |||
103 | if (section->page_cgroup) | ||
104 | return 0; | ||
105 | |||
106 | nid = page_to_nid(pfn_to_page(pfn)); | ||
107 | |||
108 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
109 | base = kmalloc_node(table_size, GFP_KERNEL, nid); | ||
110 | if (!base) | ||
111 | base = vmalloc_node(table_size, nid); | ||
112 | |||
113 | if (!base) { | ||
114 | printk(KERN_ERR "page cgroup allocation failure\n"); | ||
115 | return -ENOMEM; | ||
116 | } | ||
117 | |||
118 | for (index = 0; index < PAGES_PER_SECTION; index++) { | ||
119 | pc = base + index; | ||
120 | __init_page_cgroup(pc, pfn + index); | ||
121 | } | ||
122 | |||
123 | section = __pfn_to_section(pfn); | ||
124 | section->page_cgroup = base - pfn; | ||
125 | total_usage += table_size; | ||
126 | return 0; | ||
127 | } | ||
128 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
129 | void __free_page_cgroup(unsigned long pfn) | ||
130 | { | ||
131 | struct mem_section *ms; | ||
132 | struct page_cgroup *base; | ||
133 | |||
134 | ms = __pfn_to_section(pfn); | ||
135 | if (!ms || !ms->page_cgroup) | ||
136 | return; | ||
137 | base = ms->page_cgroup + pfn; | ||
138 | ms->page_cgroup = NULL; | ||
139 | if (is_vmalloc_addr(base)) | ||
140 | vfree(base); | ||
141 | else | ||
142 | kfree(base); | ||
143 | } | ||
144 | |||
145 | int online_page_cgroup(unsigned long start_pfn, | ||
146 | unsigned long nr_pages, | ||
147 | int nid) | ||
148 | { | ||
149 | unsigned long start, end, pfn; | ||
150 | int fail = 0; | ||
151 | |||
152 | start = start_pfn & (PAGES_PER_SECTION - 1); | ||
153 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | ||
154 | |||
155 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
156 | if (!pfn_present(pfn)) | ||
157 | continue; | ||
158 | fail = init_section_page_cgroup(pfn); | ||
159 | } | ||
160 | if (!fail) | ||
161 | return 0; | ||
162 | |||
163 | /* rollback */ | ||
164 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
165 | __free_page_cgroup(pfn); | ||
166 | |||
167 | return -ENOMEM; | ||
168 | } | ||
169 | |||
170 | int offline_page_cgroup(unsigned long start_pfn, | ||
171 | unsigned long nr_pages, int nid) | ||
172 | { | ||
173 | unsigned long start, end, pfn; | ||
174 | |||
175 | start = start_pfn & (PAGES_PER_SECTION - 1); | ||
176 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | ||
177 | |||
178 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
179 | __free_page_cgroup(pfn); | ||
180 | return 0; | ||
181 | |||
182 | } | ||
183 | |||
184 | static int page_cgroup_callback(struct notifier_block *self, | ||
185 | unsigned long action, void *arg) | ||
186 | { | ||
187 | struct memory_notify *mn = arg; | ||
188 | int ret = 0; | ||
189 | switch (action) { | ||
190 | case MEM_GOING_ONLINE: | ||
191 | ret = online_page_cgroup(mn->start_pfn, | ||
192 | mn->nr_pages, mn->status_change_nid); | ||
193 | break; | ||
194 | case MEM_CANCEL_ONLINE: | ||
195 | case MEM_OFFLINE: | ||
196 | offline_page_cgroup(mn->start_pfn, | ||
197 | mn->nr_pages, mn->status_change_nid); | ||
198 | break; | ||
199 | case MEM_GOING_OFFLINE: | ||
200 | break; | ||
201 | case MEM_ONLINE: | ||
202 | case MEM_CANCEL_OFFLINE: | ||
203 | break; | ||
204 | } | ||
205 | ret = notifier_from_errno(ret); | ||
206 | return ret; | ||
207 | } | ||
208 | |||
209 | #endif | ||
210 | |||
211 | void __init page_cgroup_init(void) | ||
212 | { | ||
213 | unsigned long pfn; | ||
214 | int fail = 0; | ||
215 | |||
216 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { | ||
217 | if (!pfn_present(pfn)) | ||
218 | continue; | ||
219 | fail = init_section_page_cgroup(pfn); | ||
220 | } | ||
221 | if (fail) { | ||
222 | printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); | ||
223 | panic("Out of memory"); | ||
224 | } else { | ||
225 | hotplug_memory_notifier(page_cgroup_callback, 0); | ||
226 | } | ||
227 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
228 | printk(KERN_INFO "please try cgroup_disable=memory option if you don't" | ||
229 | " want\n"); | ||
230 | } | ||
231 | |||
232 | void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
233 | { | ||
234 | return; | ||
235 | } | ||
236 | |||
237 | #endif | ||