Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 462
1 file changed, 267 insertions(+), 195 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,10 +47,13 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
 
+#include <trace/events/vmscan.h>
+
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +214,6 @@ struct mem_cgroup {
 	 */
 	spinlock_t reclaim_param_lock;
 
-	int	prev_priority;	/* for recording reclaim priority */
-
 	/*
 	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
@@ -268,6 +269,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
+	spinlock_t	  lock; /* for from, to, moving_task */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
 	struct task_struct *moving_task;	/* a task moving charges */
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
+	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
+	struct task_struct *p;
 
-	task_lock(task);
-	rcu_read_lock();
-	curr = try_get_mem_cgroup_from_mm(task->mm);
-	rcu_read_unlock();
-	task_unlock(task);
+	p = find_lock_task_mm(task);
+	if (!p)
+		return 0;
+	curr = try_get_mem_cgroup_from_mm(p->mm);
+	task_unlock(p);
 	if (!curr)
 		return 0;
 	/*
@@ -858,35 +862,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	return ret;
 }
 
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
-	int prev_priority;
-
-	spin_lock(&mem->reclaim_param_lock);
-	prev_priority = mem->prev_priority;
-	spin_unlock(&mem->reclaim_param_lock);
-
-	return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	if (priority < mem->prev_priority)
-		mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
 {
 	unsigned long active;
@@ -944,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 				       struct zone *zone,
 				       enum lru_list lru)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -954,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
 						      struct zone *zone)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -999,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	LIST_HEAD(pc_list);
 	struct list_head *src;
 	struct page_cgroup *pc, *tmp;
-	int nid = z->zone_pgdat->node_id;
+	int nid = zone_to_nid(z);
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
 	int lru = LRU_FILE * file + active;
@@ -1038,6 +1013,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	}
 
 	*scanned = scan;
+
+	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
+				      0, 0, 0, mode);
+
 	return nr_taken;
 }
 
@@ -1072,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *from;
+	struct mem_cgroup *to;
+	bool ret = false;
+	/*
+	 * Unlike task_move routines, we access mc.to, mc.from not under
+	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+	 */
+	spin_lock(&mc.lock);
+	from = mc.from;
+	to = mc.to;
+	if (!from)
+		goto unlock;
+	if (from == mem || to == mem
+	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+	    || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+		ret = true;
+unlock:
+	spin_unlock(&mc.lock);
+	return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+	if (mc.moving_task && current != mc.moving_task) {
+		if (mem_cgroup_under_move(mem)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+			/* moving charge context might have finished. */
+			if (mc.moving_task)
+				schedule();
+			finish_wait(&mc.waitq, &wait);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
@@ -1158,6 +1178,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
 }
 
 /*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+	u64 limit;
+	u64 memsw;
+
+	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+			total_swap_pages;
+	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	/*
+	 * If memsw is finite and limits the amount of swap space available
+	 * to this memcg, return that limit.
+	 */
+	return min(limit, memsw);
+}
+
+/*
  * Visit the first child (need not be the first child as per the ordering
  * of the cgroup list, since we track last_scanned_child) of @mem and use
  * that to reclaim free pages from.
@@ -1262,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		/* we use swappiness of local cgroup */
 		if (check_soft)
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, get_swappiness(victim), zone,
-				zone->zone_pgdat->node_id);
+				noswap, get_swappiness(victim), zone);
 		else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
 						noswap, get_swappiness(victim));
@@ -1370,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-	if (atomic_read(&mem->oom_lock))
+	if (mem && atomic_read(&mem->oom_lock))
 		memcg_wakeup_oom(mem);
 }
 
@@ -1582,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+	CHARGE_OK,		/* success */
+	CHARGE_RETRY,		/* need to retry but retry is not bad */
+	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
+	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
+	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+				int csize, bool oom_check)
+{
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long flags = 0;
+	int ret;
+
+	ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+	if (likely(!ret)) {
+		if (!do_swap_account)
+			return CHARGE_OK;
+		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+		if (likely(!ret))
+			return CHARGE_OK;
+
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+	if (csize > PAGE_SIZE) /* change csize and retry */
+		return CHARGE_RETRY;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return CHARGE_WOULDBLOCK;
+
+	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+					gfp_mask, flags);
+	/*
+	 * try_to_free_mem_cgroup_pages() might not give us a full
+	 * picture of reclaim. Some pages are reclaimed and might be
+	 * moved to swap cache or just unmapped from the cgroup.
+	 * Check the limit again to see if the reclaim reduced the
+	 * current usage of the cgroup before giving up
+	 */
+	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/* If we don't need to call oom-killer at el, return immediately */
+	if (!oom_check)
+		return CHARGE_NOMEM;
+	/* check OOM */
+	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+		return CHARGE_OOM_DIE;
+
+	return CHARGE_RETRY;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct mem_cgroup *mem = NULL;
+	int ret;
 	int csize = CHARGE_SIZE;
 
 	/*
@@ -1609,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	mem = *memcg;
-	if (likely(!mem)) {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		*memcg = mem;
-	} else {
-		css_get(&mem->css);
-	}
-	if (unlikely(!mem))
-		return 0;
-
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
-
-	while (1) {
-		int ret = 0;
-		unsigned long flags = 0;
-
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
+		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
 		if (consume_stock(mem))
 			goto done;
+		css_get(&mem->css);
+	} else {
+		struct task_struct *p;
 
-		ret = res_counter_charge(&mem->res, csize, &fail_res);
-		if (likely(!ret)) {
-			if (!do_swap_account)
-				break;
-			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-			if (likely(!ret))
-				break;
-			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, csize);
-			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									memsw);
-		} else
-			/* mem counter fails */
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									res);
-
-		/* reduce request size and retry */
-		if (csize > PAGE_SIZE) {
-			csize = PAGE_SIZE;
-			continue;
-		}
-		if (!(gfp_mask & __GFP_WAIT))
-			goto nomem;
-
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-						gfp_mask, flags);
-		if (ret)
-			continue;
-
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
 		/*
-		 * try_to_free_mem_cgroup_pages() might not give us a full
-		 * picture of reclaim. Some pages are reclaimed and might be
-		 * moved to swap cache or just unmapped from the cgroup.
-		 * Check the limit again to see if the reclaim reduced the
-		 * current usage of the cgroup before giving up
-		 *
+		 * because we don't have task_lock(), "p" can exit while
+		 * we're here. In that case, "mem" can point to root
+		 * cgroup but never be NULL. (and task_struct itself is freed
+		 * by RCU, cgroup itself is RCU safe.) Then, we have small
+		 * risk here to get wrong cgroup. But such kind of mis-account
+		 * by race always happens because we don't have cgroup_mutex().
+		 * It's overkill and we allow that small race, here.
 		 */
-		if (mem_cgroup_check_under_limit(mem_over_limit))
-			continue;
-
-		/* try to avoid oom while someone is moving charge */
-		if (mc.moving_task && current != mc.moving_task) {
-			struct mem_cgroup *from, *to;
-			bool do_continue = false;
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
 			/*
-			 * There is a small race that "from" or "to" can be
-			 * freed by rmdir, so we use css_tryget().
+			 * It seems dagerous to access memcg without css_get().
+			 * But considering how consume_stok works, it's not
+			 * necessary. If consume_stock success, some charges
+			 * from this memcg are cached on this cpu. So, we
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
 			 */
-			from = mc.from;
-			to = mc.to;
-			if (from && css_tryget(&from->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&from->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (from == mem_over_limit);
-				css_put(&from->css);
-			}
-			if (!do_continue && to && css_tryget(&to->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&to->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (to == mem_over_limit);
-				css_put(&to->css);
-			}
-			if (do_continue) {
-				DEFINE_WAIT(wait);
-				prepare_to_wait(&mc.waitq, &wait,
-						TASK_INTERRUPTIBLE);
-				/* moving charge context might have finished. */
-				if (mc.moving_task)
-					schedule();
-				finish_wait(&mc.waitq, &wait);
-				continue;
-			}
+			rcu_read_unlock();
+			goto done;
+		}
+		/* after here, we may be blocked. we need to get refcnt */
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
+
+	do {
+		bool oom_check;
+
+		/* If killed, bypass charge */
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
+			goto bypass;
+		}
+
+		oom_check = false;
+		if (oom && !nr_oom_retries) {
+			oom_check = true;
+			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 		}
 
-		if (!nr_retries--) {
-			if (!oom)
+		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+		switch (ret) {
+		case CHARGE_OK:
+			break;
+		case CHARGE_RETRY: /* not in OOM situation but retry */
+			csize = PAGE_SIZE;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
+		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
+			goto nomem;
+		case CHARGE_NOMEM: /* OOM routine works */
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
-			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-				continue;
 			}
-			/* When we reach here, current task is dying .*/
+			/* If oom, we never return -ENOMEM */
+			nr_oom_retries--;
+			break;
+		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			css_put(&mem->css);
 			goto bypass;
 		}
-	}
+	} while (ret != CHARGE_OK);
+
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
 	*memcg = NULL;
@@ -1747,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1979,10 +2061,9 @@ out:
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype,
-				struct mem_cgroup *memcg)
+				gfp_t gfp_mask, enum charge_type ctype)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct page_cgroup *pc;
 	int ret;
 
@@ -1992,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	mem = memcg;
 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 	if (ret || !mem)
 		return ret;
@@ -2020,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
 	if (unlikely(!mm))
 		mm = &init_mm;
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+				MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 static void
@@ -2030,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
-	struct mem_cgroup *mem = NULL;
 	int ret;
 
 	if (mem_cgroup_disabled())
@@ -2051,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
@@ -2063,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		unlock_page_cgroup(pc);
 	}
 
-	if (unlikely(!mm && !mem))
+	if (unlikely(!mm))
 		mm = &init_mm;
 
 	if (page_is_file_cache(page))
 		return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+				MEM_CGROUP_CHARGE_TYPE_CACHE);
 
 	/* shmem */
 	if (PageSwapCache(page)) {
+		struct mem_cgroup *mem = NULL;
+
 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
 		if (!ret)
 			__mem_cgroup_commit_charge_swapin(page, mem,
 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 	} else
 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
-					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 
 	return ret;
 }
@@ -2114,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2245,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
-	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2285,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2299,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 * special functions.
 	 */
 
-	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
-
+	/*
+	 * even after unlock, we have mem->res.usage here and this memcg
+	 * will never be freed.
+	 */
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
+	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
 
@@ -2392,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	/*
+	 * record memcg information, if swapout && memcg != NULL,
+	 * mem_cgroup_get() was called in uncharge().
+	 */
+	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
 }
 #endif
 
@@ -2476,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 			 */
 			if (!mem_cgroup_is_root(to))
 				res_counter_uncharge(&to->res, PAGE_SIZE);
-			css_put(&to->css);
 		}
 		return 0;
 	}
@@ -2611,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	ClearPageCgroupMigration(pc);
 	unlock_page_cgroup(pc);
 
-	if (unused != oldpage)
-		pc = lookup_page_cgroup(unused);
 	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
 
-	pc = lookup_page_cgroup(used);
 	/*
 	 * If a page is a file cache, radix-tree replacement is very atomic
 	 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2791,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask, int nid,
-					    int zid)
+					    gfp_t gfp_mask)
 {
 	unsigned long nr_reclaimed = 0;
 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2804,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	if (order > 0)
 		return 0;
 
-	mctz = soft_limit_tree_node_zone(nid, zid);
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
@@ -3759,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 	return 0;
 }
 
-/*
- */
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	struct cftype *cft, u64 val)
 {
@@ -4180,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 			goto one_by_one;
 		}
 		mc.precharge += count;
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:
@@ -4400,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 
 static void mem_cgroup_clear_mc(void)
 {
+	struct mem_cgroup *from = mc.from;
+	struct mem_cgroup *to = mc.to;
+
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
-		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4413,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
-		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
-		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
@@ -4431,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
-			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-			__css_put(&mc.to->css, mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 
 		mc.moved_swap = 0;
 	}
+	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	mc.moving_task = NULL;
+	spin_unlock(&mc.lock);
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
 }
 
@@ -4469,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
 			mc.moving_task = current;
+			spin_unlock(&mc.lock);
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)