author		Chris Metcalf <cmetcalf@tilera.com>	2010-08-13 19:59:15 -0400
committer	Chris Metcalf <cmetcalf@tilera.com>	2010-08-13 19:59:15 -0400
commit		7d72e6fa56c4100b9669efe0044f77ed9eb785a1 (patch)
tree		5e90bf4969809a1ab20b97432b85be20ccfaa1f4 /mm/memcontrol.c
parent		ba00376b0b13f234d839541a7b36a5bf5c2a4036 (diff)
parent		2be1f3a73dd02e38e181cf5abacb3d45a6a2d6b8 (diff)
Merge branch 'master' into for-linus
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	407
1 file changed, 243 insertions, 164 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e64586..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -268,6 +269,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
+	spinlock_t lock; /* for from, to, moving_task */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
 	struct task_struct *moving_task;	/* a task moving charges */
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
+	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
+	struct task_struct *p;
 
-	task_lock(task);
-	rcu_read_lock();
-	curr = try_get_mem_cgroup_from_mm(task->mm);
-	rcu_read_unlock();
-	task_unlock(task);
+	p = find_lock_task_mm(task);
+	if (!p)
+		return 0;
+	curr = try_get_mem_cgroup_from_mm(p->mm);
+	task_unlock(p);
 	if (!curr)
 		return 0;
 	/*
@@ -915,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 						struct zone *zone,
 						enum lru_list lru)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -925,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
 						      struct zone *zone)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -970,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	LIST_HEAD(pc_list);
 	struct list_head *src;
 	struct page_cgroup *pc, *tmp;
-	int nid = z->zone_pgdat->node_id;
+	int nid = zone_to_nid(z);
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
 	int lru = LRU_FILE * file + active;
@@ -1047,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *from;
+	struct mem_cgroup *to;
+	bool ret = false;
+	/*
+	 * Unlike task_move routines, we access mc.to, mc.from not under
+	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+	 */
+	spin_lock(&mc.lock);
+	from = mc.from;
+	to = mc.to;
+	if (!from)
+		goto unlock;
+	if (from == mem || to == mem
+	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+	    || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+		ret = true;
+unlock:
+	spin_unlock(&mc.lock);
+	return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+	if (mc.moving_task && current != mc.moving_task) {
+		if (mem_cgroup_under_move(mem)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+			/* moving charge context might have finished. */
+			if (mc.moving_task)
+				schedule();
+			finish_wait(&mc.waitq, &wait);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
@@ -1255,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		/* we use swappiness of local cgroup */
 		if (check_soft)
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, get_swappiness(victim), zone,
-				zone->zone_pgdat->node_id);
+				noswap, get_swappiness(victim), zone);
 		else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
 						noswap, get_swappiness(victim));
@@ -1363,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-	if (atomic_read(&mem->oom_lock))
+	if (mem && atomic_read(&mem->oom_lock))
 		memcg_wakeup_oom(mem);
 }
 
@@ -1575,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+	CHARGE_OK,		/* success */
+	CHARGE_RETRY,		/* need to retry but retry is not bad */
+	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
+	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
+	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+				int csize, bool oom_check)
+{
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long flags = 0;
+	int ret;
+
+	ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+	if (likely(!ret)) {
+		if (!do_swap_account)
+			return CHARGE_OK;
+		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+		if (likely(!ret))
+			return CHARGE_OK;
+
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+	if (csize > PAGE_SIZE) /* change csize and retry */
+		return CHARGE_RETRY;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return CHARGE_WOULDBLOCK;
+
+	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+					gfp_mask, flags);
+	/*
+	 * try_to_free_mem_cgroup_pages() might not give us a full
+	 * picture of reclaim. Some pages are reclaimed and might be
+	 * moved to swap cache or just unmapped from the cgroup.
+	 * Check the limit again to see if the reclaim reduced the
+	 * current usage of the cgroup before giving up
+	 */
+	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/* If we don't need to call oom-killer at el, return immediately */
+	if (!oom_check)
+		return CHARGE_NOMEM;
+	/* check OOM */
+	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+		return CHARGE_OOM_DIE;
+
+	return CHARGE_RETRY;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct mem_cgroup *mem = NULL;
+	int ret;
 	int csize = CHARGE_SIZE;
 
 	/*
@@ -1602,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	mem = *memcg;
-	if (likely(!mem)) {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		*memcg = mem;
-	} else {
-		css_get(&mem->css);
-	}
-	if (unlikely(!mem))
-		return 0;
-
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
-
-	while (1) {
-		int ret = 0;
-		unsigned long flags = 0;
-
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
+		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
 		if (consume_stock(mem))
 			goto done;
+		css_get(&mem->css);
+	} else {
+		struct task_struct *p;
 
-		ret = res_counter_charge(&mem->res, csize, &fail_res);
-		if (likely(!ret)) {
-			if (!do_swap_account)
-				break;
-			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-			if (likely(!ret))
-				break;
-			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, csize);
-			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									memsw);
-		} else
-			/* mem counter fails */
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									res);
-
-		/* reduce request size and retry */
-		if (csize > PAGE_SIZE) {
-			csize = PAGE_SIZE;
-			continue;
-		}
-		if (!(gfp_mask & __GFP_WAIT))
-			goto nomem;
-
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-							gfp_mask, flags);
-		if (ret)
-			continue;
-
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
 		/*
-		 * try_to_free_mem_cgroup_pages() might not give us a full
-		 * picture of reclaim. Some pages are reclaimed and might be
-		 * moved to swap cache or just unmapped from the cgroup.
-		 * Check the limit again to see if the reclaim reduced the
-		 * current usage of the cgroup before giving up
-		 *
+		 * because we don't have task_lock(), "p" can exit while
+		 * we're here. In that case, "mem" can point to root
+		 * cgroup but never be NULL. (and task_struct itself is freed
+		 * by RCU, cgroup itself is RCU safe.) Then, we have small
+		 * risk here to get wrong cgroup. But such kind of mis-account
+		 * by race always happens because we don't have cgroup_mutex().
+		 * It's overkill and we allow that small race, here.
 		 */
-		if (mem_cgroup_check_under_limit(mem_over_limit))
-			continue;
-
-		/* try to avoid oom while someone is moving charge */
-		if (mc.moving_task && current != mc.moving_task) {
-			struct mem_cgroup *from, *to;
-			bool do_continue = false;
-			/*
-			 * There is a small race that "from" or "to" can be
-			 * freed by rmdir, so we use css_tryget().
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
+			/*
+			 * It seems dagerous to access memcg without css_get().
+			 * But considering how consume_stok works, it's not
+			 * necessary. If consume_stock success, some charges
+			 * from this memcg are cached on this cpu. So, we
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
 			 */
-			from = mc.from;
-			to = mc.to;
-			if (from && css_tryget(&from->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&from->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (from == mem_over_limit);
-				css_put(&from->css);
-			}
-			if (!do_continue && to && css_tryget(&to->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&to->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (to == mem_over_limit);
-				css_put(&to->css);
-			}
-			if (do_continue) {
-				DEFINE_WAIT(wait);
-				prepare_to_wait(&mc.waitq, &wait,
-						TASK_INTERRUPTIBLE);
-				/* moving charge context might have finished. */
-				if (mc.moving_task)
-					schedule();
-				finish_wait(&mc.waitq, &wait);
-				continue;
-			}
+			rcu_read_unlock();
+			goto done;
+		}
+		/* after here, we may be blocked. we need to get refcnt */
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
+
+	do {
+		bool oom_check;
+
+		/* If killed, bypass charge */
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
+			goto bypass;
 		}
 
-		if (!nr_retries--) {
-			if (!oom)
+		oom_check = false;
+		if (oom && !nr_oom_retries) {
+			oom_check = true;
+			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+		}
+
+		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+		switch (ret) {
+		case CHARGE_OK:
+			break;
+		case CHARGE_RETRY: /* not in OOM situation but retry */
+			csize = PAGE_SIZE;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
+		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
+			goto nomem;
+		case CHARGE_NOMEM: /* OOM routine works */
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
-			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-				continue;
 			}
-			/* When we reach here, current task is dying .*/
+			/* If oom, we never return -ENOMEM */
+			nr_oom_retries--;
+			break;
+		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			css_put(&mem->css);
 			goto bypass;
 		}
-	}
+	} while (ret != CHARGE_OK);
+
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
 	*memcg = NULL;
@@ -1740,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1972,10 +2061,9 @@ out:
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype,
-				struct mem_cgroup *memcg)
+				gfp_t gfp_mask, enum charge_type ctype)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct page_cgroup *pc;
 	int ret;
 
@@ -1985,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	mem = memcg;
 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 	if (ret || !mem)
 		return ret;
@@ -2013,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
 	if (unlikely(!mm))
 		mm = &init_mm;
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+				MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 static void
@@ -2023,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
-	struct mem_cgroup *mem = NULL;
 	int ret;
 
 	if (mem_cgroup_disabled())
@@ -2044,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
@@ -2056,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		unlock_page_cgroup(pc);
 	}
 
-	if (unlikely(!mm && !mem))
+	if (unlikely(!mm))
 		mm = &init_mm;
 
 	if (page_is_file_cache(page))
 		return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+				MEM_CGROUP_CHARGE_TYPE_CACHE);
 
 	/* shmem */
 	if (PageSwapCache(page)) {
+		struct mem_cgroup *mem = NULL;
+
 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
 		if (!ret)
 			__mem_cgroup_commit_charge_swapin(page, mem,
 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 	} else
 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
-					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 
 	return ret;
 }
@@ -2107,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2238,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
-	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2278,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2292,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 * special functions.
 	 */
 
-	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
-
+	/*
+	 * even after unlock, we have mem->res.usage here and this memcg
+	 * will never be freed.
+	 */
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
+	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
 
@@ -2385,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	/*
+	 * record memcg information, if swapout && memcg != NULL,
+	 * mem_cgroup_get() was called in uncharge().
+	 */
+	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
 }
 #endif
 
@@ -2469,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 */
 		if (!mem_cgroup_is_root(to))
 			res_counter_uncharge(&to->res, PAGE_SIZE);
-		css_put(&to->css);
 	}
 	return 0;
 }
@@ -2604,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	ClearPageCgroupMigration(pc);
 	unlock_page_cgroup(pc);
 
-	if (unused != oldpage)
-		pc = lookup_page_cgroup(unused);
 	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
 
-	pc = lookup_page_cgroup(used);
 	/*
 	 * If a page is a file cache, radix-tree replacement is very atomic
 	 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2784,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-						gfp_t gfp_mask, int nid,
-						int zid)
+						gfp_t gfp_mask)
 {
 	unsigned long nr_reclaimed = 0;
 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2797,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	if (order > 0)
 		return 0;
 
-	mctz = soft_limit_tree_node_zone(nid, zid);
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
@@ -3752,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 	return 0;
 }
 
-/*
- */
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	struct cftype *cft, u64 val)
 {
@@ -4173,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 		goto one_by_one;
 	}
 	mc.precharge += count;
-	VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-	WARN_ON_ONCE(count > INT_MAX);
-	__css_get(&mem->css, (int)count);
 	return ret;
 }
 one_by_one:
@@ -4393,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 
 static void mem_cgroup_clear_mc(void)
 {
+	struct mem_cgroup *from = mc.from;
+	struct mem_cgroup *to = mc.to;
+
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
-		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4406,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
-		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
-		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
@@ -4424,16 +4499,18 @@
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
-			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-			__css_put(&mc.to->css, mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 
 		mc.moved_swap = 0;
 	}
+	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	mc.moving_task = NULL;
+	spin_unlock(&mc.lock);
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
 }
 
@@ -4462,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 		VM_BUG_ON(mc.moved_charge);
 		VM_BUG_ON(mc.moved_swap);
 		VM_BUG_ON(mc.moving_task);
+		spin_lock(&mc.lock);
 		mc.from = from;
 		mc.to = mem;
 		mc.precharge = 0;
 		mc.moved_charge = 0;
 		mc.moved_swap = 0;
 		mc.moving_task = current;
+		spin_unlock(&mc.lock);
 
 		ret = mem_cgroup_precharge_mc(mm);
 		if (ret)