Diffstat (limited to 'mm/memcontrol.c')
 -rw-r--r--  mm/memcontrol.c | 407
 1 file changed, 243 insertions, 164 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e64586..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -268,6 +269,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
+	spinlock_t lock; /* for from, to, moving_task */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
 	struct task_struct *moving_task; /* a task moving charges */
 	wait_queue_head_t waitq; /* a waitq for other context */
 } mc = {
+	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
+	struct task_struct *p;
 
-	task_lock(task);
-	rcu_read_lock();
-	curr = try_get_mem_cgroup_from_mm(task->mm);
-	rcu_read_unlock();
-	task_unlock(task);
+	p = find_lock_task_mm(task);
+	if (!p)
+		return 0;
+	curr = try_get_mem_cgroup_from_mm(p->mm);
+	task_unlock(p);
 	if (!curr)
 		return 0;
 	/*
@@ -915,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 					struct zone *zone,
 					enum lru_list lru)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -925,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
 						      struct zone *zone)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -970,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	LIST_HEAD(pc_list);
 	struct list_head *src;
 	struct page_cgroup *pc, *tmp;
-	int nid = z->zone_pgdat->node_id;
+	int nid = zone_to_nid(z);
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
 	int lru = LRU_FILE * file + active;
@@ -1047,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *from;
+	struct mem_cgroup *to;
+	bool ret = false;
+	/*
+	 * Unlike task_move routines, we access mc.to, mc.from not under
+	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+	 */
+	spin_lock(&mc.lock);
+	from = mc.from;
+	to = mc.to;
+	if (!from)
+		goto unlock;
+	if (from == mem || to == mem
+	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+	    || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+		ret = true;
+unlock:
+	spin_unlock(&mc.lock);
+	return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+	if (mc.moving_task && current != mc.moving_task) {
+		if (mem_cgroup_under_move(mem)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+			/* moving charge context might have finished. */
+			if (mc.moving_task)
+				schedule();
+			finish_wait(&mc.waitq, &wait);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
@@ -1255,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		/* we use swappiness of local cgroup */
 		if (check_soft)
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, get_swappiness(victim), zone,
-				zone->zone_pgdat->node_id);
+				noswap, get_swappiness(victim), zone);
 		else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
 						noswap, get_swappiness(victim));
@@ -1363,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-	if (atomic_read(&mem->oom_lock))
+	if (mem && atomic_read(&mem->oom_lock))
 		memcg_wakeup_oom(mem);
 }
 
@@ -1575,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+	CHARGE_OK,		/* success */
+	CHARGE_RETRY,		/* need to retry but retry is not bad */
+	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
+	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
+	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+				int csize, bool oom_check)
+{
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long flags = 0;
+	int ret;
+
+	ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+	if (likely(!ret)) {
+		if (!do_swap_account)
+			return CHARGE_OK;
+		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+		if (likely(!ret))
+			return CHARGE_OK;
+
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+	if (csize > PAGE_SIZE) /* change csize and retry */
+		return CHARGE_RETRY;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return CHARGE_WOULDBLOCK;
+
+	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+					gfp_mask, flags);
+	/*
+	 * try_to_free_mem_cgroup_pages() might not give us a full
+	 * picture of reclaim. Some pages are reclaimed and might be
+	 * moved to swap cache or just unmapped from the cgroup.
+	 * Check the limit again to see if the reclaim reduced the
+	 * current usage of the cgroup before giving up
+	 */
+	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/* If we don't need to call oom-killer at el, return immediately */
+	if (!oom_check)
+		return CHARGE_NOMEM;
+	/* check OOM */
+	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+		return CHARGE_OOM_DIE;
+
+	return CHARGE_RETRY;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct mem_cgroup *mem = NULL;
+	int ret;
 	int csize = CHARGE_SIZE;
 
 	/*
@@ -1602,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	mem = *memcg;
-	if (likely(!mem)) {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		*memcg = mem;
-	} else {
-		css_get(&mem->css);
-	}
-	if (unlikely(!mem))
-		return 0;
-
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
-
-	while (1) {
-		int ret = 0;
-		unsigned long flags = 0;
-
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
+		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
 		if (consume_stock(mem))
 			goto done;
+		css_get(&mem->css);
+	} else {
+		struct task_struct *p;
 
-		ret = res_counter_charge(&mem->res, csize, &fail_res);
-		if (likely(!ret)) {
-			if (!do_swap_account)
-				break;
-			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-			if (likely(!ret))
-				break;
-			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, csize);
-			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									memsw);
-		} else
-			/* mem counter fails */
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									res);
-
-		/* reduce request size and retry */
-		if (csize > PAGE_SIZE) {
-			csize = PAGE_SIZE;
-			continue;
-		}
-		if (!(gfp_mask & __GFP_WAIT))
-			goto nomem;
-
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-						gfp_mask, flags);
-		if (ret)
-			continue;
-
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
 		/*
-		 * try_to_free_mem_cgroup_pages() might not give us a full
-		 * picture of reclaim. Some pages are reclaimed and might be
-		 * moved to swap cache or just unmapped from the cgroup.
-		 * Check the limit again to see if the reclaim reduced the
-		 * current usage of the cgroup before giving up
-		 *
+		 * because we don't have task_lock(), "p" can exit while
+		 * we're here. In that case, "mem" can point to root
+		 * cgroup but never be NULL. (and task_struct itself is freed
+		 * by RCU, cgroup itself is RCU safe.) Then, we have small
+		 * risk here to get wrong cgroup. But such kind of mis-account
+		 * by race always happens because we don't have cgroup_mutex().
+		 * It's overkill and we allow that small race, here.
 		 */
-		if (mem_cgroup_check_under_limit(mem_over_limit))
-			continue;
-
-		/* try to avoid oom while someone is moving charge */
-		if (mc.moving_task && current != mc.moving_task) {
-			struct mem_cgroup *from, *to;
-			bool do_continue = false;
-			/*
-			 * There is a small race that "from" or "to" can be
-			 * freed by rmdir, so we use css_tryget().
-			 */
-			from = mc.from;
-			to = mc.to;
-			if (from && css_tryget(&from->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&from->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (from == mem_over_limit);
-				css_put(&from->css);
-			}
-			if (!do_continue && to && css_tryget(&to->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&to->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (to == mem_over_limit);
-				css_put(&to->css);
-			}
-			if (do_continue) {
-				DEFINE_WAIT(wait);
-				prepare_to_wait(&mc.waitq, &wait,
-						TASK_INTERRUPTIBLE);
-				/* moving charge context might have finished. */
-				if (mc.moving_task)
-					schedule();
-				finish_wait(&mc.waitq, &wait);
-				continue;
-			}
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
+			/*
+			 * It seems dagerous to access memcg without css_get().
+			 * But considering how consume_stok works, it's not
+			 * necessary. If consume_stock success, some charges
+			 * from this memcg are cached on this cpu. So, we
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
+			 */
+			rcu_read_unlock();
+			goto done;
+		}
+		/* after here, we may be blocked. we need to get refcnt */
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
+
+	do {
+		bool oom_check;
+
+		/* If killed, bypass charge */
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
+			goto bypass;
 		}
 
-		if (!nr_retries--) {
-			if (!oom)
+		oom_check = false;
+		if (oom && !nr_oom_retries) {
+			oom_check = true;
+			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+		}
+
+		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+		switch (ret) {
+		case CHARGE_OK:
+			break;
+		case CHARGE_RETRY: /* not in OOM situation but retry */
+			csize = PAGE_SIZE;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
+		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
+			goto nomem;
+		case CHARGE_NOMEM: /* OOM routine works */
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
-			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-				continue;
 			}
-			/* When we reach here, current task is dying .*/
+			/* If oom, we never return -ENOMEM */
+			nr_oom_retries--;
+			break;
+		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			css_put(&mem->css);
 			goto bypass;
 		}
-	}
+	} while (ret != CHARGE_OK);
+
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
 	*memcg = NULL;
@@ -1740,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1972,10 +2061,9 @@ out:
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype,
-				struct mem_cgroup *memcg)
+				gfp_t gfp_mask, enum charge_type ctype)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct page_cgroup *pc;
 	int ret;
 
@@ -1985,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	mem = memcg;
 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 	if (ret || !mem)
 		return ret;
@@ -2013,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
 	if (unlikely(!mm))
 		mm = &init_mm;
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+				MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 static void
@@ -2023,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
-	struct mem_cgroup *mem = NULL;
 	int ret;
 
 	if (mem_cgroup_disabled())
@@ -2044,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
@@ -2056,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		unlock_page_cgroup(pc);
 	}
 
-	if (unlikely(!mm && !mem))
+	if (unlikely(!mm))
 		mm = &init_mm;
 
 	if (page_is_file_cache(page))
 		return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+				MEM_CGROUP_CHARGE_TYPE_CACHE);
 
 	/* shmem */
 	if (PageSwapCache(page)) {
+		struct mem_cgroup *mem = NULL;
+
 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
 		if (!ret)
 			__mem_cgroup_commit_charge_swapin(page, mem,
 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 	} else
 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
-					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 
 	return ret;
 }
@@ -2107,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2238,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
-	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2278,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2292,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 * special functions.
 	 */
 
-	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
-
+	/*
+	 * even after unlock, we have mem->res.usage here and this memcg
+	 * will never be freed.
+	 */
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
+	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
 
@@ -2385,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	/*
+	 * record memcg information, if swapout && memcg != NULL,
+	 * mem_cgroup_get() was called in uncharge().
+	 */
+	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
 }
 #endif
 
@@ -2469,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 */
 		if (!mem_cgroup_is_root(to))
 			res_counter_uncharge(&to->res, PAGE_SIZE);
-		css_put(&to->css);
 	}
 	return 0;
 }
@@ -2604,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	ClearPageCgroupMigration(pc);
 	unlock_page_cgroup(pc);
 
-	if (unused != oldpage)
-		pc = lookup_page_cgroup(unused);
 	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
 
-	pc = lookup_page_cgroup(used);
 	/*
 	 * If a page is a file cache, radix-tree replacement is very atomic
 	 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2784,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask, int nid,
-					    int zid)
+					    gfp_t gfp_mask)
 {
 	unsigned long nr_reclaimed = 0;
 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2797,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	if (order > 0)
 		return 0;
 
-	mctz = soft_limit_tree_node_zone(nid, zid);
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
@@ -3752,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 	return 0;
 }
 
-/*
- */
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	struct cftype *cft, u64 val)
 {
@@ -4173,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 			goto one_by_one;
 		}
 		mc.precharge += count;
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:
@@ -4393,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 
 static void mem_cgroup_clear_mc(void)
 {
+	struct mem_cgroup *from = mc.from;
+	struct mem_cgroup *to = mc.to;
+
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
-		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4406,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
-		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
-		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
@@ -4424,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
-			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-			__css_put(&mc.to->css, mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 
 		mc.moved_swap = 0;
 	}
+	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	mc.moving_task = NULL;
+	spin_unlock(&mc.lock);
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
 }
 
@@ -4462,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
 			mc.moving_task = current;
+			spin_unlock(&mc.lock);
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
