author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2010-08-10 21:03:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-11 11:59:19 -0400
commit		f75ca962037ffd639a44fd88933cd9b84c4c4411 (patch)
tree		4773cd57ae45831c11783cd355f9dcb516eb66f0
parent		158e0a2d1b3cffed8b46cbc56393a1394672ef79 (diff)
memcg: avoid css_get()
Currently, the memory cgroup increments the css (cgroup subsys state) reference
count for every charged page, and the reference is held until the page is
uncharged. This has three drawbacks:

1. css_get()/css_put() call atomic_inc()/atomic_dec(), so heavy use of them on
   a large SMP system does not scale well.
2. Because the css refcnt can never reach a "ready-to-release" state, the
   cgroup notify_on_release handler cannot work with memcg.
3. The css refcnt is an atomic_t, i.e. at most 32 bits, which may be too small.

This has been a problem since the first merge of memcg. This patch is an
attempt to remove the per-page css refcnt. Even without it, pre_destroy()
provides enough synchronization:

 - check res->usage == 0.
 - check there are no pages on the LRU.

At first glance, css_get() still seems to be called in try_charge() after this
patch, but the logic is:

 - If the memcg of mm->owner is the cached one, consume_stock() succeeds and
   we return immediately.
 - If consume_stock() returns false, css_get() is called and we enter the slow
   path, which may block. At the end of the slow path, css_put() is called and
   we restart from the beginning if necessary.

So in the fast path we never call css_get() and avoid touching the shared
counter. This makes the most common case fast. Here is the result of a
multi-threaded page-fault benchmark:

[Before]
    25.32%  multi-fault-all  [kernel.kallsyms]  [k] clear_page_c
     9.30%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
     8.02%  multi-fault-all  [kernel.kallsyms]  [k] try_get_mem_cgroup_from_mm <=====(*)
     7.83%  multi-fault-all  [kernel.kallsyms]  [k] down_read_trylock
     5.38%  multi-fault-all  [kernel.kallsyms]  [k] __css_put
     5.29%  multi-fault-all  [kernel.kallsyms]  [k] __alloc_pages_nodemask
     4.92%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irq
     4.24%  multi-fault-all  [kernel.kallsyms]  [k] up_read
     3.53%  multi-fault-all  [kernel.kallsyms]  [k] css_put
     2.11%  multi-fault-all  [kernel.kallsyms]  [k] handle_mm_fault
     1.76%  multi-fault-all  [kernel.kallsyms]  [k] __rmqueue
     1.64%  multi-fault-all  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge

[After]
    28.41%  multi-fault-all  [kernel.kallsyms]  [k] clear_page_c
    10.08%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irq
     9.58%  multi-fault-all  [kernel.kallsyms]  [k] down_read_trylock
     9.38%  multi-fault-all  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
     5.86%  multi-fault-all  [kernel.kallsyms]  [k] __alloc_pages_nodemask
     5.65%  multi-fault-all  [kernel.kallsyms]  [k] up_read
     2.82%  multi-fault-all  [kernel.kallsyms]  [k] handle_mm_fault
     2.64%  multi-fault-all  [kernel.kallsyms]  [k] mem_cgroup_add_lru_list
     2.48%  multi-fault-all  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge

The 8.02% spent in try_get_mem_cgroup_from_mm() disappears because this patch
removes the css_tryget() in it. (But yes, this is an extreme case.)

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
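The fast-path/slow-path split described above boils down to: try the per-CPU
cached charge (consume_stock()) without touching the css reference count at
all, and take a reference only when entering the potentially blocking slow
path. Below is a minimal, self-contained sketch of that pattern; it is not the
kernel code, and all names, the single-threaded "stock", and the batch size
are made up for illustration.

/*
 * Simplified, single-threaded model of the charge fast path / slow path.
 * struct memcg, consume_stock(), charge_slow_path() and try_charge() are
 * hypothetical stand-ins for the kernel structures and functions.
 */
#include <stdbool.h>
#include <stdio.h>

struct memcg {
	int refcnt;		/* stands in for the css reference count */
	int stock;		/* pre-charged pages cached "per cpu"    */
	long usage, limit;	/* stands in for the res_counter         */
};

/* Fast path: use a cached charge; no reference-count traffic at all. */
static bool consume_stock(struct memcg *mem)
{
	if (mem->stock > 0) {
		mem->stock--;
		return true;
	}
	return false;
}

/* Slow path: may "block", so pin the memcg first and refill the stock. */
static bool charge_slow_path(struct memcg *mem, int batch)
{
	mem->refcnt++;			/* css_tryget() analogue          */
	if (mem->usage + batch > mem->limit) {
		mem->refcnt--;
		return false;		/* would trigger reclaim/OOM here */
	}
	mem->usage += batch;
	mem->stock += batch - 1;	/* refill_stock() analogue        */
	mem->refcnt--;			/* css_put() analogue             */
	return true;
}

static bool try_charge(struct memcg *mem)
{
	if (consume_stock(mem))		/* common case: no atomics on css */
		return true;
	return charge_slow_path(mem, 32);
}

int main(void)
{
	struct memcg mem = { .limit = 64 };
	int charged = 0;

	for (int i = 0; i < 80; i++)
		charged += try_charge(&mem);
	printf("charged %d pages, usage %ld, refcnt %d\n",
	       charged, mem.usage, mem.refcnt);
	return 0;
}

The point mirrored from the patch is that the common case (consume_stock()
succeeding) performs no reference-count operation at all; only the slow path
pays for the css_tryget()/css_put() pair.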
-rw-r--r--	mm/memcontrol.c	119
1 file changed, 76 insertions(+), 43 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f52b0a1861c4..a1c3c317a4dd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1714,28 +1714,66 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (*memcg) {
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
 		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
+		if (consume_stock(mem))
+			goto done;
 		css_get(&mem->css);
 	} else {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		if (unlikely(!mem))
-			return 0;
-		*memcg = mem;
-	}
+		struct task_struct *p;
 
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
+		/*
+		 * because we don't have task_lock(), "p" can exit while
+		 * we're here. In that case, "mem" can point to root
+		 * cgroup but never be NULL. (and task_struct itself is freed
+		 * by RCU, cgroup itself is RCU safe.) Then, we have small
+		 * risk here to get wrong cgroup. But such kind of mis-account
+		 * by race always happens because we don't have cgroup_mutex().
+		 * It's overkill and we allow that small race, here.
+		 */
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
+			/*
+			 * It seems dagerous to access memcg without css_get().
+			 * But considering how consume_stok works, it's not
+			 * necessary. If consume_stock success, some charges
+			 * from this memcg are cached on this cpu. So, we
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
+			 */
+			rcu_read_unlock();
+			goto done;
+		}
+		/* after here, we may be blocked. we need to get refcnt */
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
 
 	do {
 		bool oom_check;
 
-		if (consume_stock(mem))
-			goto done; /* don't need to fill stock */
 		/* If killed, bypass charge */
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
 			goto bypass;
+		}
 
 		oom_check = false;
 		if (oom && !nr_oom_retries) {
@@ -1750,30 +1788,36 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			csize = PAGE_SIZE;
-			break;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom)
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
+			}
 			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
 		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+			css_put(&mem->css);
 			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
-	if (mem)
-		css_put(&mem->css);
 	*memcg = NULL;
 	return 0;
 }
@@ -1790,11 +1834,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -2155,7 +2195,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2325,10 +2364,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2340,11 +2375,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 */
 
 	unlock_page_cgroup(pc);
-
+	/*
+	 * even after unlock, we have mem->res.usage here and this memcg
+	 * will never be freed.
+	 */
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
+	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
 
@@ -2431,13 +2472,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	/*
+	 * record memcg information, if swapout && memcg != NULL,
+	 * mem_cgroup_get() was called in uncharge().
+	 */
+	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
 }
 #endif
 
@@ -2515,7 +2555,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 			 */
 			if (!mem_cgroup_is_root(to))
 				res_counter_uncharge(&to->res, PAGE_SIZE);
-			css_put(&to->css);
 		}
 		return 0;
 	}
@@ -4214,9 +4253,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 			goto one_by_one;
 		}
 		mc.precharge += count;
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:
@@ -4452,7 +4488,6 @@ static void mem_cgroup_clear_mc(void)
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
-		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
@@ -4466,8 +4501,6 @@ static void mem_cgroup_clear_mc(void)
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
-			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-			__css_put(&mc.to->css, mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 