Diffstat (limited to 'mm/memcontrol.c')

 mm/memcontrol.c | 248 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 148 insertions(+), 100 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e64586..991860e6e0a7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1047,6 +1047,49 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/* A routine for testing whether mem is under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *from = mc.from;
+	struct mem_cgroup *to = mc.to;
+	bool ret = false;
+
+	if (from == mem || to == mem)
+		return true;
+
+	if (!from || !to || !mem->use_hierarchy)
+		return false;
+
+	rcu_read_lock();
+	if (css_tryget(&from->css)) {
+		ret = css_is_ancestor(&from->css, &mem->css);
+		css_put(&from->css);
+	}
+	if (!ret && css_tryget(&to->css)) {
+		ret = css_is_ancestor(&to->css, &mem->css);
+		css_put(&to->css);
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+	if (mc.moving_task && current != mc.moving_task) {
+		if (mem_cgroup_under_move(mem)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+			/* moving charge context might have finished. */
+			if (mc.moving_task)
+				schedule();
+			finish_wait(&mc.waitq, &wait);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
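
The helpers added in the hunk above implement a sleep that cannot miss a wakeup: mem_cgroup_wait_acct_move() first queues itself on mc.waitq with prepare_to_wait(), then re-checks mc.moving_task before calling schedule(), so a wakeup arriving between the check and the sleep is not lost. For reference only, here is a minimal userspace sketch of that check-then-sleep idea, modeled with a pthread mutex and condition variable rather than the kernel waitqueue; the names (moving_task, wait_acct_move, mover) are illustrative, not kernel API.

/* Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static bool moving_task = true;		/* stands in for mc.moving_task */

static void wait_acct_move(void)
{
	pthread_mutex_lock(&lock);
	/* the re-check under the lock plays the role of the
	 * "if (mc.moving_task)" test between prepare_to_wait()
	 * and schedule() */
	while (moving_task)
		pthread_cond_wait(&waitq, &lock);
	pthread_mutex_unlock(&lock);
}

static void *mover(void *arg)
{
	(void)arg;
	sleep(1);			/* pretend to move charges */
	pthread_mutex_lock(&lock);
	moving_task = false;		/* like clearing mc.moving_task */
	pthread_cond_broadcast(&waitq);	/* like wake_up_all(&mc.waitq) */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, mover, NULL);
	wait_acct_move();		/* blocks until the "move" is done */
	pthread_join(t, NULL);
	puts("move finished, charge can retry");
	return 0;
}

Holding the lock across the re-check is what prepare_to_wait()'s queueing buys the kernel version: the waiter is already registered when it tests the condition for the last time.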
@@ -1575,16 +1618,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+	CHARGE_OK,		/* success */
+	CHARGE_RETRY,		/* need to retry but retry is not bad */
+	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
+	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough res. */
+	CHARGE_OOM_DIE,		/* the current task is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+				int csize, bool oom_check)
+{
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long flags = 0;
+	int ret;
+
+	ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+	if (likely(!ret)) {
+		if (!do_swap_account)
+			return CHARGE_OK;
+		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+		if (likely(!ret))
+			return CHARGE_OK;
+
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+	if (csize > PAGE_SIZE) /* change csize and retry */
+		return CHARGE_RETRY;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return CHARGE_WOULDBLOCK;
+
+	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+					gfp_mask, flags);
+	/*
+	 * try_to_free_mem_cgroup_pages() might not give us a full
+	 * picture of reclaim. Some pages are reclaimed and might be
+	 * moved to swap cache or just unmapped from the cgroup.
+	 * Check the limit again to see if the reclaim reduced the
+	 * current usage of the cgroup before giving up.
+	 */
+	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/*
+	 * During task move, charges can be doubly counted, so it's better
+	 * to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/* If we don't need to call the oom-killer at all, return immediately */
+	if (!oom_check)
+		return CHARGE_NOMEM;
+	/* check OOM */
+	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+		return CHARGE_OOM_DIE;
+
+	return CHARGE_RETRY;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct mem_cgroup *mem = NULL;
+	int ret;
 	int csize = CHARGE_SIZE;
 
 	/*
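
Taken together with the next hunk, the new __mem_cgroup_do_charge() reduces one charge attempt to a single return code, and __mem_cgroup_try_charge() becomes a small state machine around it: shrink the request and retry on CHARGE_RETRY, fail on CHARGE_WOULDBLOCK or CHARGE_NOMEM, and bypass the charge on CHARGE_OOM_DIE. The following self-contained sketch shows that loop in isolation; the stub do_charge() and the constants merely simulate the real charging and reclaim and are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* worth retrying, e.g. with a smaller request */
	CHARGE_NOMEM,		/* give up and return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* caller cannot sleep */
	CHARGE_OOM_DIE,		/* caller was killed by the OOM killer */
};

#define PAGE_SIZE	4096
#define CHARGE_SIZE	(32 * PAGE_SIZE)	/* made-up charge batch */
#define RECLAIM_RETRIES	5

/* stub: pretend the first two attempts hit the limit, then succeed */
static int do_charge(int csize, bool oom_check)
{
	static int attempts;
	(void)csize;
	(void)oom_check;
	return ++attempts < 3 ? CHARGE_RETRY : CHARGE_OK;
}

static int try_charge(bool oom)
{
	int nr_oom_retries = RECLAIM_RETRIES;
	int csize = CHARGE_SIZE;
	int ret;

	do {
		/* only ask for an OOM check once the retries are spent */
		bool oom_check = oom && !nr_oom_retries;
		if (oom_check)
			nr_oom_retries = RECLAIM_RETRIES;

		ret = do_charge(csize, oom_check);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:	/* drop to a single page and retry */
			csize = PAGE_SIZE;
			break;
		case CHARGE_WOULDBLOCK:
			return -1;	/* -ENOMEM in the kernel */
		case CHARGE_NOMEM:
			if (!oom)
				return -1;
			nr_oom_retries--;	/* eventually triggers oom_check */
			break;
		case CHARGE_OOM_DIE:
			return 0;	/* bypass: the caller is dying anyway */
		}
	} while (ret != CHARGE_OK);

	printf("charged %d bytes\n", csize);
	return 0;
}

int main(void)
{
	return try_charge(true);
}

Note the nr_oom_retries design the sketch copies from the patch: oom_check is only passed as true once the retries are used up, so the OOM killer is consulted only after several rounds of reclaim have failed.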
@@ -1602,120 +1712,56 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	mem = *memcg;
-	if (likely(!mem)) {
+	if (*memcg) {
+		mem = *memcg;
+		css_get(&mem->css);
+	} else {
 		mem = try_get_mem_cgroup_from_mm(mm);
+		if (unlikely(!mem))
+			return 0;
 		*memcg = mem;
-	} else {
-		css_get(&mem->css);
 	}
-	if (unlikely(!mem))
-		return 0;
 
 	VM_BUG_ON(css_is_removed(&mem->css));
 	if (mem_cgroup_is_root(mem))
 		goto done;
 
-	while (1) {
-		int ret = 0;
-		unsigned long flags = 0;
+	do {
+		bool oom_check;
 
 		if (consume_stock(mem))
-			goto done;
-
-		ret = res_counter_charge(&mem->res, csize, &fail_res);
-		if (likely(!ret)) {
-			if (!do_swap_account)
-				break;
-			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-			if (likely(!ret))
-				break;
-			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, csize);
-			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									memsw);
-		} else
-			/* mem counter fails */
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									res);
+			goto done; /* don't need to fill stock */
+		/* If killed, bypass charge */
+		if (fatal_signal_pending(current))
+			goto bypass;
 
-		/* reduce request size and retry */
-		if (csize > PAGE_SIZE) {
-			csize = PAGE_SIZE;
-			continue;
-		}
+		oom_check = false;
+		if (oom && !nr_oom_retries) {
+			oom_check = true;
+			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+		}
 		}
-		if (!(gfp_mask & __GFP_WAIT))
-			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-						gfp_mask, flags);
-		if (ret)
-			continue;
+		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
 
-		/*
-		 * try_to_free_mem_cgroup_pages() might not give us a full
-		 * picture of reclaim. Some pages are reclaimed and might be
-		 * moved to swap cache or just unmapped from the cgroup.
-		 * Check the limit again to see if the reclaim reduced the
-		 * current usage of the cgroup before giving up
-		 *
-		 */
-		if (mem_cgroup_check_under_limit(mem_over_limit))
-			continue;
-
-		/* try to avoid oom while someone is moving charge */
-		if (mc.moving_task && current != mc.moving_task) {
-			struct mem_cgroup *from, *to;
-			bool do_continue = false;
-			/*
-			 * There is a small race that "from" or "to" can be
-			 * freed by rmdir, so we use css_tryget().
-			 */
-			from = mc.from;
-			to = mc.to;
-			if (from && css_tryget(&from->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&from->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (from == mem_over_limit);
-				css_put(&from->css);
-			}
-			if (!do_continue && to && css_tryget(&to->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&to->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (to == mem_over_limit);
-				css_put(&to->css);
-			}
-			if (do_continue) {
-				DEFINE_WAIT(wait);
-				prepare_to_wait(&mc.waitq, &wait,
-						TASK_INTERRUPTIBLE);
-				/* moving charge context might have finished. */
-				if (mc.moving_task)
-					schedule();
-				finish_wait(&mc.waitq, &wait);
-				continue;
-			}
-		}
-
-		if (!nr_retries--) {
+		switch (ret) {
+		case CHARGE_OK:
+			break;
+		case CHARGE_RETRY: /* not in OOM situation but retry */
+			csize = PAGE_SIZE;
+			break;
+		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			goto nomem;
+		case CHARGE_NOMEM: /* OOM routine works */
 			if (!oom)
 				goto nomem;
-			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-				continue;
-			}
-			/* When we reach here, current task is dying .*/
-			css_put(&mem->css);
+			/* If oom, we never return -ENOMEM */
+			nr_oom_retries--;
+			break;
+		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			goto bypass;
 		}
-	}
+	} while (ret != CHARGE_OK);
+
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
 done:
@@ -1724,6 +1770,8 @@ nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
 bypass:
+	if (mem)
+		css_put(&mem->css);
 	*memcg = NULL;
 	return 0;
 }
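
The last hunk settles the reference counting that the rewrite made uniform: every path into the charge loop now holds exactly one reference on mem->css (taken via css_get() or try_get_mem_cgroup_from_mm()), and the CHARGE_OOM_DIE case no longer drops it before jumping, so the bypass label must put it; the "if (mem)" guard covers bypass paths reached before any cgroup reference was taken. A toy sketch of this one-get-per-entry, one-put-per-exit-label shape, with illustrative names only:

#include <assert.h>

struct css { int refcnt; };

static void css_get(struct css *c) { c->refcnt++; }
static void css_put(struct css *c) { c->refcnt--; }

/* fail_mode: 0 = success, 1 = hard failure, 2 = dying task */
static int try_charge(struct css *mem, int fail_mode)
{
	css_get(mem);		/* one reference for the whole call */

	if (fail_mode == 1)
		goto nomem;
	if (fail_mode == 2)
		goto bypass;

	css_put(mem);		/* done: */
	return 0;
nomem:
	css_put(mem);
	return -1;
bypass:
	css_put(mem);		/* the put this hunk adds */
	return 0;
}

int main(void)
{
	struct css mem = { 0 };
	for (int mode = 0; mode < 3; mode++) {
		try_charge(&mem, mode);
		assert(mem.refcnt == 0);	/* balanced on every exit */
	}
	return 0;
}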