author    KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>  2010-08-10 21:02:57 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>      2010-08-11 11:59:18 -0400
commit    4b53433468c87794b523e4683fbd4e8e8aca1f63
tree      08f80fd5ec5d824a28aa4e0b15c7e2e83eed9fec
parent    65e0e811667dedd4f19b268df9d856ecacb629de
memcg: clean up try_charge main loop
mem_cgroup_try_charge() has a big loop in it and is hard to read. Most of the
routines in it serve the slow path. This patch moves that code out of the loop
and makes it clear what each step does.

Summary:
- refactor the test for whether a memcg is under account move into a helper
- refactor the wait for the end of a moving task's accounting into a helper
- refactor the main loop's slow path into a function, so the return code makes
  it clear why we retry or quit
- add a fatal_signal_pending() check so a killed task bypasses the charge loop

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 mm/memcontrol.c | 248
 1 file changed, 148 insertions(+), 100 deletions(-)
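Before the patch itself, here is a minimal, self-contained C sketch (not
kernel code; every name in it is illustrative) of the control-flow pattern the
refactor introduces: the slow path returns an enum describing why it failed,
and the caller's loop maps that status onto "retry", "return an error", or
"give up".

#include <stdbool.h>
#include <stdio.h>

enum charge_status {
	CHARGE_OK,	/* success */
	CHARGE_RETRY,	/* transient failure; retrying is fine */
	CHARGE_NOMEM,	/* hard failure; caller returns an error */
};

/* Pretend slow path: the charge fits on the third attempt. */
static enum charge_status do_charge(int *attempt)
{
	return (++*attempt < 3) ? CHARGE_RETRY : CHARGE_OK;
}

static int try_charge(void)
{
	enum charge_status ret;
	int attempt = 0;
	int retries = 5;

	do {
		ret = do_charge(&attempt);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:
			if (!retries--)		/* give up eventually */
				return -1;
			break;
		case CHARGE_NOMEM:
			return -1;	/* stands in for -ENOMEM */
		}
	} while (ret != CHARGE_OK);
	return 0;
}

int main(void)
{
	printf("try_charge() -> %d\n", try_charge());	/* prints 0 */
	return 0;
}

The kernel version below distinguishes two further outcomes (CHARGE_WOULDBLOCK
when the caller may not sleep, CHARGE_OOM_DIE when the task was killed by the
OOM killer) and also bypasses the loop entirely when fatal_signal_pending()
reports that the current task has been killed.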
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e64586..991860e6e0a7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1047,6 +1047,49 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/* A routine for testing whether mem is under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *from = mc.from;
+	struct mem_cgroup *to = mc.to;
+	bool ret = false;
+
+	if (from == mem || to == mem)
+		return true;
+
+	if (!from || !to || !mem->use_hierarchy)
+		return false;
+
+	rcu_read_lock();
+	if (css_tryget(&from->css)) {
+		ret = css_is_ancestor(&from->css, &mem->css);
+		css_put(&from->css);
+	}
+	if (!ret && css_tryget(&to->css)) {
+		ret = css_is_ancestor(&to->css, &mem->css);
+		css_put(&to->css);
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+	if (mc.moving_task && current != mc.moving_task) {
+		if (mem_cgroup_under_move(mem)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+			/* moving charge context might have finished. */
+			if (mc.moving_task)
+				schedule();
+			finish_wait(&mc.waitq, &wait);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
@@ -1575,16 +1618,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+	CHARGE_OK,		/* success */
+	CHARGE_RETRY,		/* need to retry but retry is not bad */
+	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
+	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and not enough res. */
+	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+				int csize, bool oom_check)
+{
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long flags = 0;
+	int ret;
+
+	ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+	if (likely(!ret)) {
+		if (!do_swap_account)
+			return CHARGE_OK;
+		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+		if (likely(!ret))
+			return CHARGE_OK;
+
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+	if (csize > PAGE_SIZE) /* change csize and retry */
+		return CHARGE_RETRY;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return CHARGE_WOULDBLOCK;
+
+	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+					gfp_mask, flags);
+	/*
+	 * try_to_free_mem_cgroup_pages() might not give us a full
+	 * picture of reclaim. Some pages are reclaimed and might be
+	 * moved to swap cache or just unmapped from the cgroup.
+	 * Check the limit again to see if the reclaim reduced the
+	 * current usage of the cgroup before giving up
+	 */
+	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/* If we don't need to call the oom-killer at all, return immediately */
+	if (!oom_check)
+		return CHARGE_NOMEM;
+	/* check OOM */
+	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+		return CHARGE_OOM_DIE;
+
+	return CHARGE_RETRY;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct mem_cgroup *mem = NULL;
+	int ret;
 	int csize = CHARGE_SIZE;
 
 	/*
@@ -1602,120 +1712,56 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	mem = *memcg;
-	if (likely(!mem)) {
+	if (*memcg) {
+		mem = *memcg;
+		css_get(&mem->css);
+	} else {
 		mem = try_get_mem_cgroup_from_mm(mm);
+		if (unlikely(!mem))
+			return 0;
 		*memcg = mem;
-	} else {
-		css_get(&mem->css);
 	}
-	if (unlikely(!mem))
-		return 0;
 
 	VM_BUG_ON(css_is_removed(&mem->css));
 	if (mem_cgroup_is_root(mem))
 		goto done;
 
-	while (1) {
-		int ret = 0;
-		unsigned long flags = 0;
+	do {
+		bool oom_check;
 
 		if (consume_stock(mem))
-			goto done;
-
-		ret = res_counter_charge(&mem->res, csize, &fail_res);
-		if (likely(!ret)) {
-			if (!do_swap_account)
-				break;
-			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-			if (likely(!ret))
-				break;
-			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, csize);
-			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									memsw);
-		} else
-			/* mem counter fails */
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									res);
+			goto done; /* don't need to fill stock */
+		/* If killed, bypass charge */
+		if (fatal_signal_pending(current))
+			goto bypass;
 
-		/* reduce request size and retry */
-		if (csize > PAGE_SIZE) {
-			csize = PAGE_SIZE;
-			continue;
+		oom_check = false;
+		if (oom && !nr_oom_retries) {
+			oom_check = true;
+			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 		}
-		if (!(gfp_mask & __GFP_WAIT))
-			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-						gfp_mask, flags);
-		if (ret)
-			continue;
+		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
 
-		/*
-		 * try_to_free_mem_cgroup_pages() might not give us a full
-		 * picture of reclaim. Some pages are reclaimed and might be
-		 * moved to swap cache or just unmapped from the cgroup.
-		 * Check the limit again to see if the reclaim reduced the
-		 * current usage of the cgroup before giving up
-		 *
-		 */
-		if (mem_cgroup_check_under_limit(mem_over_limit))
-			continue;
-
-		/* try to avoid oom while someone is moving charge */
-		if (mc.moving_task && current != mc.moving_task) {
-			struct mem_cgroup *from, *to;
-			bool do_continue = false;
-			/*
-			 * There is a small race that "from" or "to" can be
-			 * freed by rmdir, so we use css_tryget().
-			 */
-			from = mc.from;
-			to = mc.to;
-			if (from && css_tryget(&from->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&from->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (from == mem_over_limit);
-				css_put(&from->css);
-			}
-			if (!do_continue && to && css_tryget(&to->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&to->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (to == mem_over_limit);
-				css_put(&to->css);
-			}
-			if (do_continue) {
-				DEFINE_WAIT(wait);
-				prepare_to_wait(&mc.waitq, &wait,
-							TASK_INTERRUPTIBLE);
-				/* moving charge context might have finished. */
-				if (mc.moving_task)
-					schedule();
-				finish_wait(&mc.waitq, &wait);
-				continue;
-			}
-		}
-
-		if (!nr_retries--) {
+		switch (ret) {
+		case CHARGE_OK:
+			break;
+		case CHARGE_RETRY: /* not in OOM situation but retry */
+			csize = PAGE_SIZE;
+			break;
+		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			goto nomem;
+		case CHARGE_NOMEM: /* OOM routine works */
 			if (!oom)
 				goto nomem;
-			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-				continue;
-			}
-			/* When we reach here, current task is dying .*/
-			css_put(&mem->css);
+			/* If oom, we never return -ENOMEM */
+			nr_oom_retries--;
+			break;
+		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			goto bypass;
 		}
-	}
+	} while (ret != CHARGE_OK);
+
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
 done:
@@ -1724,6 +1770,8 @@ nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
 bypass:
+	if (mem)
+		css_put(&mem->css);
 	*memcg = NULL;
 	return 0;
 }
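A note on the wait in mem_cgroup_wait_acct_move(): the prepare_to_wait() /
schedule() / finish_wait() sequence re-checks mc.moving_task after registering
on the waitqueue, so a wakeup that races with the check cannot be lost. Below
is a rough userspace analogy using pthreads rather than the kernel waitqueue
API; all names are illustrative, and the mutex/condvar pair stands in for the
waitqueue discipline.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t move_done = PTHREAD_COND_INITIALIZER;
static bool moving;		/* stands in for mc.moving_task != NULL */

/* Returns true if we waited; the caller should then retry its charge. */
static bool wait_acct_move(void)
{
	bool waited = false;

	pthread_mutex_lock(&lock);
	while (moving) {	/* condition re-checked under the lock */
		pthread_cond_wait(&move_done, &lock);
		waited = true;
	}
	pthread_mutex_unlock(&lock);
	return waited;
}

/* The mover clears the flag and wakes every waiter, like mc.waitq. */
static void finish_move(void)
{
	pthread_mutex_lock(&lock);
	moving = false;
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&move_done);
}

static void *mover(void *arg)
{
	(void)arg;
	sleep(1);		/* pretend the task move takes a while */
	finish_move();
	return NULL;
}

int main(void)
{
	pthread_t t;

	moving = true;
	pthread_create(&t, NULL, mover, NULL);
	printf("waited: %d\n", wait_acct_move());	/* prints "waited: 1" */
	pthread_join(t, NULL);
	return 0;
}

The kernel sequence gets the same race-free "re-check after registering as a
waiter" guarantee without a lock around the flag: prepare_to_wait() publishes
the waiter and sets the task state before the condition is tested again.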