author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2010-08-10 21:03:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-11 11:59:19 -0400
commit	f75ca962037ffd639a44fd88933cd9b84c4c4411 (patch)
tree	4773cd57ae45831c11783cd355f9dcb516eb66f0
parent	158e0a2d1b3cffed8b46cbc56393a1394672ef79 (diff)
memcg: avoid css_get()
Currently, the memory cgroup increments the reference count of its css
(cgroup subsys state) for every charged page, and the reference is held
until the page is uncharged. This has three bad effects:

1. Because css_get()/css_put() use atomic_inc()/atomic_dec(), calling
   them heavily on a large SMP system does not scale well.
2. Because the css refcnt can never reach a "ready-to-release" state,
   cgroup's notify_on_release handler cannot work with memcg.
3. The css refcnt is an atomic_t, i.e. at most 32 bits wide, which may
   be too small.

This has been a problem since the first merge of memcg.
This patch is an attempt to remove the per-page css refcnt. Even
without the refcnt, pre_destroy() provides enough synchronization by
- checking that res->usage == 0, and
- checking that no pages remain on the LRU
(see the sketch after this list).
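
As a rough illustration, the two checks amount to something like the
following (a simplified sketch only, not the actual mm/memcontrol.c
code; memcg_ready_to_destroy() and memcg_lru_is_empty() are
hypothetical helpers):

	/*
	 * Sketch of the synchronization pre_destroy() relies on, per the
	 * two checks above. res_counter_read_u64() is the real res_counter
	 * accessor; memcg_lru_is_empty() is a hypothetical helper here.
	 */
	static bool memcg_ready_to_destroy(struct mem_cgroup *mem)
	{
		/* 1. no charge may be outstanding against this memcg */
		if (res_counter_read_u64(&mem->res, RES_USAGE) != 0)
			return false;
		/* 2. no page may remain on this memcg's LRU lists */
		return memcg_lru_is_empty(mem);
	}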
This patch removes the per-page css refcnt. At first glance it may look
as if css_get() is still called in try_charge() even after this patch,
but the logic is:

- If the memcg of mm->owner is the cached one, consume_stock() succeeds
  and we return immediately.
- If consume_stock() returns false, css_get() is called and we enter
  the slow path, which may block. At the end of the slow path,
  css_put() is called, and we restart from the beginning if necessary.

So in the fast path we never call css_get() and avoid touching the
shared counter. This makes the most common case fast, as sketched
below.
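
Condensed, the reworked charge path looks roughly like this (a sketch
distilled from the __mem_cgroup_try_charge() hunk below; the
reclaim/OOM loop and error handling are omitted):

	/* Sketch of the new fast path; see the real hunk for details. */
	again:
		if (*memcg) {			/* caller passed a valid css */
			mem = *memcg;
			if (mem_cgroup_is_root(mem) || consume_stock(mem))
				goto done;	/* fast path: no css_get() */
			css_get(&mem->css);	/* slow path: pin the css */
		} else {
			rcu_read_lock();
			mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (mem_cgroup_is_root(mem) || consume_stock(mem)) {
				rcu_read_unlock();
				goto done;	/* fast path: no refcount traffic */
			}
			/* slow path may block, so take a reference first */
			if (!css_tryget(&mem->css)) {
				rcu_read_unlock();
				goto again;	/* cgroup went away; retry */
			}
			rcu_read_unlock();
		}
		/* ... slow path: charge/reclaim loop, ends with css_put() ... */
	done:
		*memcg = mem;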
Here are the results of a multi-threaded page-fault benchmark:
[Before]
25.32% multi-fault-all [kernel.kallsyms] [k] clear_page_c
9.30% multi-fault-all [kernel.kallsyms] [k] _raw_spin_lock_irqsave
8.02% multi-fault-all [kernel.kallsyms] [k] try_get_mem_cgroup_from_mm <=====(*)
7.83% multi-fault-all [kernel.kallsyms] [k] down_read_trylock
5.38% multi-fault-all [kernel.kallsyms] [k] __css_put
5.29% multi-fault-all [kernel.kallsyms] [k] __alloc_pages_nodemask
4.92% multi-fault-all [kernel.kallsyms] [k] _raw_spin_lock_irq
4.24% multi-fault-all [kernel.kallsyms] [k] up_read
3.53% multi-fault-all [kernel.kallsyms] [k] css_put
2.11% multi-fault-all [kernel.kallsyms] [k] handle_mm_fault
1.76% multi-fault-all [kernel.kallsyms] [k] __rmqueue
1.64% multi-fault-all [kernel.kallsyms] [k] __mem_cgroup_commit_charge
[After]
28.41% multi-fault-all [kernel.kallsyms] [k] clear_page_c
10.08% multi-fault-all [kernel.kallsyms] [k] _raw_spin_lock_irq
9.58% multi-fault-all [kernel.kallsyms] [k] down_read_trylock
9.38% multi-fault-all [kernel.kallsyms] [k] _raw_spin_lock_irqsave
5.86% multi-fault-all [kernel.kallsyms] [k] __alloc_pages_nodemask
5.65% multi-fault-all [kernel.kallsyms] [k] up_read
2.82% multi-fault-all [kernel.kallsyms] [k] handle_mm_fault
2.64% multi-fault-all [kernel.kallsyms] [k] mem_cgroup_add_lru_list
2.48% multi-fault-all [kernel.kallsyms] [k] __mem_cgroup_commit_charge
The 8.02% spent in try_get_mem_cgroup_from_mm() (marked (*) above)
disappears because this patch removes the css_tryget() call inside it.
(Admittedly, this is an extreme case.)
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	mm/memcontrol.c	119
1 file changed, 76 insertions(+), 43 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f52b0a1861c4..a1c3c317a4dd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1714,28 +1714,66 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (*memcg) {
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
 		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
+		if (consume_stock(mem))
+			goto done;
 		css_get(&mem->css);
 	} else {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		if (unlikely(!mem))
-			return 0;
-		*memcg = mem;
-	}
+		struct task_struct *p;
 
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
+		/*
+		 * because we don't have task_lock(), "p" can exit while
+		 * we're here. In that case, "mem" can point to root
+		 * cgroup but never be NULL. (and task_struct itself is freed
+		 * by RCU, cgroup itself is RCU safe.) Then, we have small
+		 * risk here to get wrong cgroup. But such kind of mis-account
+		 * by race always happens because we don't have cgroup_mutex().
+		 * It's overkill and we allow that small race, here.
+		 */
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
+			/*
+			 * It seems dangerous to access memcg without css_get().
+			 * But considering how consume_stock works, it's not
+			 * necessary. If consume_stock succeeds, some charges
+			 * from this memcg are cached on this cpu. So, we
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
+			 */
+			rcu_read_unlock();
+			goto done;
+		}
+		/* after here, we may be blocked. we need to get refcnt */
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
 
 	do {
 		bool oom_check;
 
-		if (consume_stock(mem))
-			goto done; /* don't need to fill stock */
 		/* If killed, bypass charge */
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
 			goto bypass;
+		}
 
 		oom_check = false;
 		if (oom && !nr_oom_retries) {
@@ -1750,30 +1788,36 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			csize = PAGE_SIZE;
-			break;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom)
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
+			}
 			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
 		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+			css_put(&mem->css);
 			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
-	if (mem)
-		css_put(&mem->css);
 	*memcg = NULL;
 	return 0;
 }
@@ -1790,11 +1834,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -2155,7 +2195,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2325,10 +2364,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2340,11 +2375,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 */
 
 	unlock_page_cgroup(pc);
-
+	/*
+	 * even after unlock, we have mem->res.usage here and this memcg
+	 * will never be freed.
+	 */
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
+	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
 
@@ -2431,13 +2472,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	/*
+	 * record memcg information, if swapout && memcg != NULL,
+	 * mem_cgroup_get() was called in uncharge().
+	 */
+	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
 }
 #endif
 
@@ -2515,7 +2555,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 */
 		if (!mem_cgroup_is_root(to))
 			res_counter_uncharge(&to->res, PAGE_SIZE);
-		css_put(&to->css);
 	}
 	return 0;
 }
@@ -4214,9 +4253,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 			goto one_by_one;
 		}
 		mc.precharge += count;
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:
@@ -4452,7 +4488,6 @@ static void mem_cgroup_clear_mc(void)
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
-		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
@@ -4466,8 +4501,6 @@ static void mem_cgroup_clear_mc(void)
 		 */
 		res_counter_uncharge(&mc.to->res,
 				PAGE_SIZE * mc.moved_swap);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-		__css_put(&mc.to->css, mc.moved_swap);
 	}
 	/* we've already done mem_cgroup_get(mc.to) */
 