path: root/mm/memcontrol.c
author		Jiri Kosina <jkosina@suse.cz>	2010-12-10 09:19:18 -0500
committer	Jiri Kosina <jkosina@suse.cz>	2010-12-10 09:19:18 -0500
commit		2ade0c1d9d93b7642212657ef76f4a1e30233711 (patch)
tree		63bc720c0ffe5f4760cac4ed617b9870b050175e /mm/memcontrol.c
parent		504499f22c08a03e2e19dc88d31aa0ecd2ac815e (diff)
parent		6313e3c21743cc88bb5bd8aa72948ee1e83937b6 (diff)
Merge branch 'master' into upstream
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	488
1 file changed, 351 insertions(+), 137 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..7a22b4129211 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63int do_swap_account __read_mostly; 63int do_swap_account __read_mostly;
64static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 64
65/* to remember the boot option */
66#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
67static int really_do_swap_account __initdata = 1;
68#else
69static int really_do_swap_account __initdata = 0;
70#endif
71
65#else 72#else
66#define do_swap_account (0) 73#define do_swap_account (0)
67#endif 74#endif
@@ -89,7 +96,10 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 96 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 97 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 98 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 99 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
100 /* incremented at every pagein/pageout */
101 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
102 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 103
94 MEM_CGROUP_STAT_NSTATS, 104 MEM_CGROUP_STAT_NSTATS,
95}; 105};
@@ -254,6 +264,12 @@ struct mem_cgroup {
254 * percpu counter. 264 * percpu counter.
255 */ 265 */
256 struct mem_cgroup_stat_cpu *stat; 266 struct mem_cgroup_stat_cpu *stat;
267 /*
268 * used when a cpu is offlined, or for other synchronization;
269 * see mem_cgroup_read_stat().
270 */
271 struct mem_cgroup_stat_cpu nocpu_base;
272 spinlock_t pcp_counter_lock;
257}; 273};
258 274
259/* Stuffs for move charges at task migration. */ 275/* Stuffs for move charges at task migration. */
@@ -269,13 +285,14 @@ enum move_type {
269 285
270/* "mc" and its members are protected by cgroup_mutex */ 286/* "mc" and its members are protected by cgroup_mutex */
271static struct move_charge_struct { 287static struct move_charge_struct {
272 spinlock_t lock; /* for from, to, moving_task */ 288 spinlock_t lock; /* for from, to */
273 struct mem_cgroup *from; 289 struct mem_cgroup *from;
274 struct mem_cgroup *to; 290 struct mem_cgroup *to;
275 unsigned long precharge; 291 unsigned long precharge;
276 unsigned long moved_charge; 292 unsigned long moved_charge;
277 unsigned long moved_swap; 293 unsigned long moved_swap;
278 struct task_struct *moving_task; /* a task moving charges */ 294 struct task_struct *moving_task; /* a task moving charges */
295 struct mm_struct *mm;
279 wait_queue_head_t waitq; /* a waitq for other context */ 296 wait_queue_head_t waitq; /* a waitq for other context */
280} mc = { 297} mc = {
281 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 298 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -530,14 +547,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
530 return mz; 547 return mz;
531} 548}
532 549
550/*
551 * Implementation Note: reading percpu statistics for memcg.
552 *
553 * Both vmstat[] and percpu_counter use thresholds and do periodic
554 * synchronization to implement "quick" reads. There is a trade-off between
555 * reading cost and precision of the value. Then, we may have a chance to
556 * implement a periodic synchronization of counters in memcg's own counter.
557 *
558 * But this _read() function is used for the user interface now. The user
559 * accounts memory usage by memory cgroup and _always_ requires an exact value,
560 * because he accounts memory. Even if we provided a quick-and-fuzzy read, we
561 * would always have to visit all online cpus and compute the sum. So, for now,
562 * unnecessary synchronization is not implemented (just done for cpu hotplug).
563 *
564 * If there are kernel-internal users which could make use of a not-exact
565 * value, and reading all cpu values becomes a performance bottleneck in some
566 * common workload, thresholds and synchronization as in vmstat[] should be
567 * implemented.
568 */
533static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 569static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
534 enum mem_cgroup_stat_index idx) 570 enum mem_cgroup_stat_index idx)
535{ 571{
536 int cpu; 572 int cpu;
537 s64 val = 0; 573 s64 val = 0;
538 574
539 for_each_possible_cpu(cpu) 575 get_online_cpus();
576 for_each_online_cpu(cpu)
540 val += per_cpu(mem->stat->count[idx], cpu); 577 val += per_cpu(mem->stat->count[idx], cpu);
578#ifdef CONFIG_HOTPLUG_CPU
579 spin_lock(&mem->pcp_counter_lock);
580 val += mem->nocpu_base.count[idx];
581 spin_unlock(&mem->pcp_counter_lock);
582#endif
583 put_online_cpus();
541 return val; 584 return val;
542} 585}
543 586
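The new read path above sums only online CPUs under get_online_cpus() and then folds in mem->nocpu_base, which collects the counters of CPUs that have since gone offline (see mem_cgroup_drain_pcp_counter() later in this patch). A minimal userspace analogue of that pattern, purely illustrative and not kernel code (the slot and lock names are invented here):

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 4

static long long counts[NSLOTS];              /* per-slot counters ("per-cpu") */
static int online[NSLOTS] = {1, 1, 1, 1};
static long long nocpu_base;                  /* retired slots drain here */
static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;

static void retire_slot(int slot)             /* like mem_cgroup_drain_pcp_counter() */
{
	pthread_mutex_lock(&base_lock);
	nocpu_base += counts[slot];
	counts[slot] = 0;
	online[slot] = 0;
	pthread_mutex_unlock(&base_lock);
}

static long long read_stat(void)              /* like mem_cgroup_read_stat() */
{
	long long val = 0;
	int i;

	for (i = 0; i < NSLOTS; i++)
		if (online[i])
			val += counts[i];
	pthread_mutex_lock(&base_lock);
	val += nocpu_base;                    /* counters of offlined slots */
	pthread_mutex_unlock(&base_lock);
	return val;
}

int main(void)
{
	counts[0] = 10; counts[1] = 20; counts[2] = 30; counts[3] = 40;
	retire_slot(2);                       /* "cpu 2 goes offline" */
	printf("total = %lld\n", read_stat());  /* prints 100 either way */
	return 0;
}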
@@ -659,40 +702,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
659 return mem; 702 return mem;
660} 703}
661 704
662/* 705/* The caller has to guarantee "mem" exists before calling this */
663 * Call callback function against all cgroup under hierarchy tree. 706static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
664 */
665static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
666 int (*func)(struct mem_cgroup *, void *))
667{ 707{
668 int found, ret, nextid;
669 struct cgroup_subsys_state *css; 708 struct cgroup_subsys_state *css;
670 struct mem_cgroup *mem; 709 int found;
671
672 if (!root->use_hierarchy)
673 return (*func)(root, data);
674 710
675 nextid = 1; 711 if (!mem) /* ROOT cgroup has the smallest ID */
676 do { 712 return root_mem_cgroup; /*css_put/get against root is ignored*/
677 ret = 0; 713 if (!mem->use_hierarchy) {
714 if (css_tryget(&mem->css))
715 return mem;
716 return NULL;
717 }
718 rcu_read_lock();
719 /*
720 * searching a memory cgroup which has the smallest ID under given
721 * ROOT cgroup. (ID >= 1)
722 */
723 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
724 if (css && css_tryget(css))
725 mem = container_of(css, struct mem_cgroup, css);
726 else
678 mem = NULL; 727 mem = NULL;
728 rcu_read_unlock();
729 return mem;
730}
731
732static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
733 struct mem_cgroup *root,
734 bool cond)
735{
736 int nextid = css_id(&iter->css) + 1;
737 int found;
738 int hierarchy_used;
739 struct cgroup_subsys_state *css;
740
741 hierarchy_used = iter->use_hierarchy;
742
743 css_put(&iter->css);
744 /* If no ROOT, walk all, ignore hierarchy */
745 if (!cond || (root && !hierarchy_used))
746 return NULL;
747
748 if (!root)
749 root = root_mem_cgroup;
679 750
751 do {
752 iter = NULL;
680 rcu_read_lock(); 753 rcu_read_lock();
681 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 754
682 &found); 755 css = css_get_next(&mem_cgroup_subsys, nextid,
756 &root->css, &found);
683 if (css && css_tryget(css)) 757 if (css && css_tryget(css))
684 mem = container_of(css, struct mem_cgroup, css); 758 iter = container_of(css, struct mem_cgroup, css);
685 rcu_read_unlock(); 759 rcu_read_unlock();
686 760 /* If css is NULL, no more cgroups will be found */
687 if (mem) {
688 ret = (*func)(mem, data);
689 css_put(&mem->css);
690 }
691 nextid = found + 1; 761 nextid = found + 1;
692 } while (!ret && css); 762 } while (css && !iter);
693 763
694 return ret; 764 return iter;
695} 765}
766/*
767 * for_each_mem_cgroup_tree() visits all cgroups under the tree. Please be
768 * careful that breaking out of the loop is not allowed; we hold a reference
769 * count. Instead, set "cond" to false and "continue" to exit the loop.
770 */
771#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
772 for (iter = mem_cgroup_start_loop(root);\
773 iter != NULL;\
774 iter = mem_cgroup_get_next(iter, root, cond))
775
776#define for_each_mem_cgroup_tree(iter, root) \
777 for_each_mem_cgroup_tree_cond(iter, root, true)
778
779#define for_each_mem_cgroup_all(iter) \
780 for_each_mem_cgroup_tree_cond(iter, NULL, true)
781
696 782
697static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 783static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
698{ 784{
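The callback-based mem_cgroup_walk_tree() is replaced by an iterator built from mem_cgroup_start_loop()/mem_cgroup_get_next(); the conversions later in this patch (mem_cgroup_count_children(), mem_cgroup_oom_lock() and friends) show the plain form. A hypothetical helper, not part of the patch, sketching the cond-variant as the comment above prescribes:

static bool mem_cgroup_tree_larger_than(struct mem_cgroup *root, int limit)
{
	struct mem_cgroup *iter;
	bool cond = true;
	int num = 0;

	/*
	 * Exit early by clearing "cond", never by "break", so the css
	 * reference held on "iter" is dropped by mem_cgroup_get_next()
	 * on the final iteration step.
	 */
	for_each_mem_cgroup_tree_cond(iter, root, cond) {
		if (++num > limit)
			cond = false;
	}
	return num > limit;
}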
@@ -1051,7 +1137,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1051 return swappiness; 1137 return swappiness;
1052} 1138}
1053 1139
1054/* A routine for testing mem is not under move_account */ 1140static void mem_cgroup_start_move(struct mem_cgroup *mem)
1141{
1142 int cpu;
1143
1144 get_online_cpus();
1145 spin_lock(&mem->pcp_counter_lock);
1146 for_each_online_cpu(cpu)
1147 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1148 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1149 spin_unlock(&mem->pcp_counter_lock);
1150 put_online_cpus();
1151
1152 synchronize_rcu();
1153}
1154
1155static void mem_cgroup_end_move(struct mem_cgroup *mem)
1156{
1157 int cpu;
1158
1159 if (!mem)
1160 return;
1161 get_online_cpus();
1162 spin_lock(&mem->pcp_counter_lock);
1163 for_each_online_cpu(cpu)
1164 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1165 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1166 spin_unlock(&mem->pcp_counter_lock);
1167 put_online_cpus();
1168}
1169/*
1170 * Two routines for checking whether "mem" is under move_account() or not.
1171 *
1172 * mem_cgroup_stealed() - checks whether a cgroup is mc.from or not. This is
1173 * used for avoiding a race in accounting. If true,
1174 * pc->mem_cgroup may be overwritten.
1175 *
1176 * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to, or
1177 * under the hierarchy of moving cgroups. This is for
1178 * waiting at high memory pressure caused by "move".
1179 */
1180
1181static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1182{
1183 VM_BUG_ON(!rcu_read_lock_held());
1184 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1185}
1055 1186
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1187static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{ 1188{
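mem_cgroup_start_move()/mem_cgroup_end_move() act as a lightweight write-side latch: the per-cpu MEM_CGROUP_ON_MOVE count is raised under pcp_counter_lock, and synchronize_rcu() then guarantees that every statistics updater which entered its RCU read-side section before the flag was raised has finished. The real callers are added later in this patch (mem_cgroup_force_empty() and the can_attach/clear_mc pair); a sketch of the intended pairing, with a hypothetical caller name:

static void move_accounting_between_groups(struct mem_cgroup *from,
					   struct mem_cgroup *to)
{
	mem_cgroup_start_move(from);	/* raise ON_MOVE, then synchronize_rcu() */
	/* ... rewrite pc->mem_cgroup from "from" to "to" ... */
	mem_cgroup_end_move(from);	/* readers go back to the lockless path */
}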
@@ -1092,13 +1223,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1092 return false; 1223 return false;
1093} 1224}
1094 1225
1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1096{
1097 int *val = data;
1098 (*val)++;
1099 return 0;
1100}
1101
1102/** 1226/**
1103 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1227 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1104 * @memcg: The memory cgroup that went over limit 1228 * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1297,10 @@ done:
1173static int mem_cgroup_count_children(struct mem_cgroup *mem) 1297static int mem_cgroup_count_children(struct mem_cgroup *mem)
1174{ 1298{
1175 int num = 0; 1299 int num = 0;
1176 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1300 struct mem_cgroup *iter;
1301
1302 for_each_mem_cgroup_tree(iter, mem)
1303 num++;
1177 return num; 1304 return num;
1178} 1305}
1179 1306
@@ -1322,49 +1449,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1322 return total; 1449 return total;
1323} 1450}
1324 1451
1325static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1326{
1327 int *val = (int *)data;
1328 int x;
1329 /*
1330 * Logically, we can stop scanning immediately when we find
1331 * a memcg is already locked. But condidering unlock ops and
1332 * creation/removal of memcg, scan-all is simple operation.
1333 */
1334 x = atomic_inc_return(&mem->oom_lock);
1335 *val = max(x, *val);
1336 return 0;
1337}
1338/* 1452/*
1339 * Check OOM-Killer is already running under our hierarchy. 1453 * Check OOM-Killer is already running under our hierarchy.
1340 * If someone is running, return false. 1454 * If someone is running, return false.
1341 */ 1455 */
1342static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1456static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1343{ 1457{
1344 int lock_count = 0; 1458 int x, lock_count = 0;
1459 struct mem_cgroup *iter;
1345 1460
1346 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); 1461 for_each_mem_cgroup_tree(iter, mem) {
1462 x = atomic_inc_return(&iter->oom_lock);
1463 lock_count = max(x, lock_count);
1464 }
1347 1465
1348 if (lock_count == 1) 1466 if (lock_count == 1)
1349 return true; 1467 return true;
1350 return false; 1468 return false;
1351} 1469}
1352 1470
1353static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) 1471static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1354{ 1472{
1473 struct mem_cgroup *iter;
1474
1355 /* 1475 /*
1356 * When a new child is created while the hierarchy is under oom, 1476 * When a new child is created while the hierarchy is under oom,
1357 * mem_cgroup_oom_lock() may not be called. We have to use 1477 * mem_cgroup_oom_lock() may not be called. We have to use
1358 * atomic_add_unless() here. 1478 * atomic_add_unless() here.
1359 */ 1479 */
1360 atomic_add_unless(&mem->oom_lock, -1, 0); 1480 for_each_mem_cgroup_tree(iter, mem)
1481 atomic_add_unless(&iter->oom_lock, -1, 0);
1361 return 0; 1482 return 0;
1362} 1483}
1363 1484
1364static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1365{
1366 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1367}
1368 1485
1369static DEFINE_MUTEX(memcg_oom_mutex); 1486static DEFINE_MUTEX(memcg_oom_mutex);
1370static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1487static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
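The converted mem_cgroup_oom_lock() takes the hierarchy-wide OOM lock only when every memcg in the tree goes 0 -> 1, i.e. when the maximum of atomic_inc_return() over the tree is exactly 1. A small standalone C program, not kernel code, walking through the two-cgroup case where one member is already locked:

#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
	/* two memcgs in one hierarchy; index 1 is already locked elsewhere */
	atomic_int oom_lock[2] = {0, 1};
	int lock_count = 0;
	int i;

	for (i = 0; i < 2; i++) {
		int x = atomic_fetch_add(&oom_lock[i], 1) + 1;  /* inc_return */
		if (x > lock_count)
			lock_count = x;
	}
	/* lock_count == 2, so mem_cgroup_oom_lock() would return false */
	printf("lock_count = %d -> %s\n", lock_count,
	       lock_count == 1 ? "lock acquired" : "someone else holds it");
	return 0;
}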
@@ -1462,34 +1579,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1462/* 1579/*
1463 * Currently used to update mapped file statistics, but the routine can be 1580 * Currently used to update mapped file statistics, but the routine can be
1464 * generalized to update other statistics as well. 1581 * generalized to update other statistics as well.
1582 *
1583 * Notes: Race condition
1584 *
1585 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
1586 * it tends to be costly. Under some conditions, however, we don't need
1587 * to do so _always_.
1588 *
1589 * Considering "charge", lock_page_cgroup() is not required because all
1590 * file-stat operations happen after a page is attached to the radix-tree.
1591 * There is no race with "charge".
1592 *
1593 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1594 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
1595 * if there is a race with "uncharge". The statistics themselves are properly
1596 * handled by flags.
1597 *
1598 * Considering "move", this is the only case where we see a race. To keep the
1599 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
1600 * the possibility of a race. If there is one, we take a lock.
1465 */ 1601 */
1466void mem_cgroup_update_file_mapped(struct page *page, int val) 1602
1603static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1467{ 1604{
1468 struct mem_cgroup *mem; 1605 struct mem_cgroup *mem;
1469 struct page_cgroup *pc; 1606 struct page_cgroup *pc = lookup_page_cgroup(page);
1607 bool need_unlock = false;
1470 1608
1471 pc = lookup_page_cgroup(page);
1472 if (unlikely(!pc)) 1609 if (unlikely(!pc))
1473 return; 1610 return;
1474 1611
1475 lock_page_cgroup(pc); 1612 rcu_read_lock();
1476 mem = pc->mem_cgroup; 1613 mem = pc->mem_cgroup;
1477 if (!mem || !PageCgroupUsed(pc)) 1614 if (unlikely(!mem || !PageCgroupUsed(pc)))
1478 goto done; 1615 goto out;
1616 /* pc->mem_cgroup is unstable ? */
1617 if (unlikely(mem_cgroup_stealed(mem))) {
1618 /* take a lock to safely access pc->mem_cgroup */
1619 lock_page_cgroup(pc);
1620 need_unlock = true;
1621 mem = pc->mem_cgroup;
1622 if (!mem || !PageCgroupUsed(pc))
1623 goto out;
1624 }
1479 1625
1480 /* 1626 this_cpu_add(mem->stat->count[idx], val);
1481 * Preemption is already disabled. We can use __this_cpu_xxx 1627
1482 */ 1628 switch (idx) {
1483 if (val > 0) { 1629 case MEM_CGROUP_STAT_FILE_MAPPED:
1484 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1630 if (val > 0)
1485 SetPageCgroupFileMapped(pc); 1631 SetPageCgroupFileMapped(pc);
1486 } else { 1632 else if (!page_mapped(page))
1487 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1633 ClearPageCgroupFileMapped(pc);
1488 ClearPageCgroupFileMapped(pc); 1634 break;
1635 default:
1636 BUG();
1489 } 1637 }
1490 1638
1491done: 1639out:
1492 unlock_page_cgroup(pc); 1640 if (unlikely(need_unlock))
1641 unlock_page_cgroup(pc);
1642 rcu_read_unlock();
1643 return;
1644}
1645
1646void mem_cgroup_update_file_mapped(struct page *page, int val)
1647{
1648 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1493} 1649}
1494 1650
1495/* 1651/*
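As the comment above says, mem_cgroup_update_file_stat() is meant to be generalized to other per-page file statistics. A hypothetical extension, not part of this patch: a new index (MEM_CGROUP_STAT_FILE_DIRTY is invented here for the example) would get its own case in the switch above plus a thin wrapper:

void mem_cgroup_update_file_dirty(struct page *page, int val)
{
	/* MEM_CGROUP_STAT_FILE_DIRTY and its switch case are assumed here */
	mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, val);
}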
@@ -1605,15 +1761,55 @@ static void drain_all_stock_sync(void)
1605 atomic_dec(&memcg_drain_count); 1761 atomic_dec(&memcg_drain_count);
1606} 1762}
1607 1763
1608static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 1764/*
1765 * This function drains the percpu counter values from a DEAD cpu and
1766 * folds them into nocpu_base. Note that this function can be preempted.
1767 */
1768static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1769{
1770 int i;
1771
1772 spin_lock(&mem->pcp_counter_lock);
1773 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1774 s64 x = per_cpu(mem->stat->count[i], cpu);
1775
1776 per_cpu(mem->stat->count[i], cpu) = 0;
1777 mem->nocpu_base.count[i] += x;
1778 }
1779 /* need to clear ON_MOVE value, works as a kind of lock. */
1780 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1781 spin_unlock(&mem->pcp_counter_lock);
1782}
1783
1784static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
1785{
1786 int idx = MEM_CGROUP_ON_MOVE;
1787
1788 spin_lock(&mem->pcp_counter_lock);
1789 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
1790 spin_unlock(&mem->pcp_counter_lock);
1791}
1792
1793static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1609 unsigned long action, 1794 unsigned long action,
1610 void *hcpu) 1795 void *hcpu)
1611{ 1796{
1612 int cpu = (unsigned long)hcpu; 1797 int cpu = (unsigned long)hcpu;
1613 struct memcg_stock_pcp *stock; 1798 struct memcg_stock_pcp *stock;
1799 struct mem_cgroup *iter;
1614 1800
1615 if (action != CPU_DEAD) 1801 if ((action == CPU_ONLINE)) {
1802 for_each_mem_cgroup_all(iter)
1803 synchronize_mem_cgroup_on_move(iter, cpu);
1616 return NOTIFY_OK; 1804 return NOTIFY_OK;
1805 }
1806
1807 if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
1808 return NOTIFY_OK;
1809
1810 for_each_mem_cgroup_all(iter)
1811 mem_cgroup_drain_pcp_counter(iter, cpu);
1812
1617 stock = &per_cpu(memcg_stock, cpu); 1813 stock = &per_cpu(memcg_stock, cpu);
1618 drain_stock(stock); 1814 drain_stock(stock);
1619 return NOTIFY_OK; 1815 return NOTIFY_OK;
@@ -1964,7 +2160,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1964{ 2160{
1965 VM_BUG_ON(from == to); 2161 VM_BUG_ON(from == to);
1966 VM_BUG_ON(PageLRU(pc->page)); 2162 VM_BUG_ON(PageLRU(pc->page));
1967 VM_BUG_ON(!PageCgroupLocked(pc)); 2163 VM_BUG_ON(!page_is_cgroup_locked(pc));
1968 VM_BUG_ON(!PageCgroupUsed(pc)); 2164 VM_BUG_ON(!PageCgroupUsed(pc));
1969 VM_BUG_ON(pc->mem_cgroup != from); 2165 VM_BUG_ON(pc->mem_cgroup != from);
1970 2166
@@ -3038,6 +3234,7 @@ move_account:
3038 lru_add_drain_all(); 3234 lru_add_drain_all();
3039 drain_all_stock_sync(); 3235 drain_all_stock_sync();
3040 ret = 0; 3236 ret = 0;
3237 mem_cgroup_start_move(mem);
3041 for_each_node_state(node, N_HIGH_MEMORY) { 3238 for_each_node_state(node, N_HIGH_MEMORY) {
3042 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3239 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3043 enum lru_list l; 3240 enum lru_list l;
@@ -3051,6 +3248,7 @@ move_account:
3051 if (ret) 3248 if (ret)
3052 break; 3249 break;
3053 } 3250 }
3251 mem_cgroup_end_move(mem);
3054 memcg_oom_recover(mem); 3252 memcg_oom_recover(mem);
3055 /* it seems parent cgroup doesn't have enough mem */ 3253 /* it seems parent cgroup doesn't have enough mem */
3056 if (ret == -ENOMEM) 3254 if (ret == -ENOMEM)
@@ -3137,33 +3335,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3137 return retval; 3335 return retval;
3138} 3336}
3139 3337
3140struct mem_cgroup_idx_data {
3141 s64 val;
3142 enum mem_cgroup_stat_index idx;
3143};
3144 3338
3145static int 3339static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3146mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3340 enum mem_cgroup_stat_index idx)
3147{ 3341{
3148 struct mem_cgroup_idx_data *d = data; 3342 struct mem_cgroup *iter;
3149 d->val += mem_cgroup_read_stat(mem, d->idx); 3343 s64 val = 0;
3150 return 0;
3151}
3152 3344
3153static void 3345 /* each per-cpu value can be negative, so use s64 */
3154mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3346 for_each_mem_cgroup_tree(iter, mem)
3155 enum mem_cgroup_stat_index idx, s64 *val) 3347 val += mem_cgroup_read_stat(iter, idx);
3156{ 3348
3157 struct mem_cgroup_idx_data d; 3349 if (val < 0) /* race ? */
3158 d.idx = idx; 3350 val = 0;
3159 d.val = 0; 3351 return val;
3160 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3161 *val = d.val;
3162} 3352}
3163 3353
3164static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3354static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3165{ 3355{
3166 u64 idx_val, val; 3356 u64 val;
3167 3357
3168 if (!mem_cgroup_is_root(mem)) { 3358 if (!mem_cgroup_is_root(mem)) {
3169 if (!swap) 3359 if (!swap)
@@ -3172,16 +3362,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3172 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3362 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3173 } 3363 }
3174 3364
3175 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3365 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
3176 val = idx_val; 3366 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
3177 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3178 val += idx_val;
3179 3367
3180 if (swap) { 3368 if (swap)
3181 mem_cgroup_get_recursive_idx_stat(mem, 3369 val += mem_cgroup_get_recursive_idx_stat(mem,
3182 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 3370 MEM_CGROUP_STAT_SWAPOUT);
3183 val += idx_val;
3184 }
3185 3371
3186 return val << PAGE_SHIFT; 3372 return val << PAGE_SHIFT;
3187} 3373}
@@ -3389,9 +3575,9 @@ struct {
3389}; 3575};
3390 3576
3391 3577
3392static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3578static void
3579mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3393{ 3580{
3394 struct mcs_total_stat *s = data;
3395 s64 val; 3581 s64 val;
3396 3582
3397 /* per cpu stat */ 3583 /* per cpu stat */
@@ -3421,13 +3607,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3421 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3607 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3422 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3608 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3423 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3609 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3424 return 0;
3425} 3610}
3426 3611
3427static void 3612static void
3428mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3613mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3429{ 3614{
3430 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3615 struct mem_cgroup *iter;
3616
3617 for_each_mem_cgroup_tree(iter, mem)
3618 mem_cgroup_get_local_stat(iter, s);
3431} 3619}
3432 3620
3433static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3621static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3792,7 @@ static int compare_thresholds(const void *a, const void *b)
3604 return _a->threshold - _b->threshold; 3792 return _a->threshold - _b->threshold;
3605} 3793}
3606 3794
3607static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 3795static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
3608{ 3796{
3609 struct mem_cgroup_eventfd_list *ev; 3797 struct mem_cgroup_eventfd_list *ev;
3610 3798
@@ -3615,7 +3803,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3615 3803
3616static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3804static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3617{ 3805{
3618 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 3806 struct mem_cgroup *iter;
3807
3808 for_each_mem_cgroup_tree(iter, mem)
3809 mem_cgroup_oom_notify_cb(iter);
3619} 3810}
3620 3811
3621static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3812static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -4025,14 +4216,17 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4025 4216
4026 memset(mem, 0, size); 4217 memset(mem, 0, size);
4027 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4218 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4028 if (!mem->stat) { 4219 if (!mem->stat)
4029 if (size < PAGE_SIZE) 4220 goto out_free;
4030 kfree(mem); 4221 spin_lock_init(&mem->pcp_counter_lock);
4031 else
4032 vfree(mem);
4033 mem = NULL;
4034 }
4035 return mem; 4222 return mem;
4223
4224out_free:
4225 if (size < PAGE_SIZE)
4226 kfree(mem);
4227 else
4228 vfree(mem);
4229 return NULL;
4036} 4230}
4037 4231
4038/* 4232/*
@@ -4158,7 +4352,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4158 &per_cpu(memcg_stock, cpu); 4352 &per_cpu(memcg_stock, cpu);
4159 INIT_WORK(&stock->work, drain_local_stock); 4353 INIT_WORK(&stock->work, drain_local_stock);
4160 } 4354 }
4161 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4355 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4162 } else { 4356 } else {
4163 parent = mem_cgroup_from_cont(cont->parent); 4357 parent = mem_cgroup_from_cont(cont->parent);
4164 mem->use_hierarchy = parent->use_hierarchy; 4358 mem->use_hierarchy = parent->use_hierarchy;
@@ -4445,7 +4639,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4445 unsigned long precharge; 4639 unsigned long precharge;
4446 struct vm_area_struct *vma; 4640 struct vm_area_struct *vma;
4447 4641
4448 down_read(&mm->mmap_sem); 4642 /* We've already held the mmap_sem */
4449 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4643 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4450 struct mm_walk mem_cgroup_count_precharge_walk = { 4644 struct mm_walk mem_cgroup_count_precharge_walk = {
4451 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4645 .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4457,7 +4651,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4457 walk_page_range(vma->vm_start, vma->vm_end, 4651 walk_page_range(vma->vm_start, vma->vm_end,
4458 &mem_cgroup_count_precharge_walk); 4652 &mem_cgroup_count_precharge_walk);
4459 } 4653 }
4460 up_read(&mm->mmap_sem);
4461 4654
4462 precharge = mc.precharge; 4655 precharge = mc.precharge;
4463 mc.precharge = 0; 4656 mc.precharge = 0;
@@ -4508,11 +4701,17 @@ static void mem_cgroup_clear_mc(void)
4508 4701
4509 mc.moved_swap = 0; 4702 mc.moved_swap = 0;
4510 } 4703 }
4704 if (mc.mm) {
4705 up_read(&mc.mm->mmap_sem);
4706 mmput(mc.mm);
4707 }
4511 spin_lock(&mc.lock); 4708 spin_lock(&mc.lock);
4512 mc.from = NULL; 4709 mc.from = NULL;
4513 mc.to = NULL; 4710 mc.to = NULL;
4514 mc.moving_task = NULL;
4515 spin_unlock(&mc.lock); 4711 spin_unlock(&mc.lock);
4712 mc.moving_task = NULL;
4713 mc.mm = NULL;
4714 mem_cgroup_end_move(from);
4516 memcg_oom_recover(from); 4715 memcg_oom_recover(from);
4517 memcg_oom_recover(to); 4716 memcg_oom_recover(to);
4518 wake_up_all(&mc.waitq); 4717 wake_up_all(&mc.waitq);
@@ -4537,26 +4736,38 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4537 return 0; 4736 return 0;
4538 /* We move charges only when we move a owner of the mm */ 4737 /* We move charges only when we move a owner of the mm */
4539 if (mm->owner == p) { 4738 if (mm->owner == p) {
4739 /*
4740 * We do all the move-charge work under one mmap_sem to
4741 * avoid deadlock with down_write(&mmap_sem)
4742 * -> try_charge() -> if (mc.moving_task) -> sleep.
4743 */
4744 down_read(&mm->mmap_sem);
4745
4540 VM_BUG_ON(mc.from); 4746 VM_BUG_ON(mc.from);
4541 VM_BUG_ON(mc.to); 4747 VM_BUG_ON(mc.to);
4542 VM_BUG_ON(mc.precharge); 4748 VM_BUG_ON(mc.precharge);
4543 VM_BUG_ON(mc.moved_charge); 4749 VM_BUG_ON(mc.moved_charge);
4544 VM_BUG_ON(mc.moved_swap); 4750 VM_BUG_ON(mc.moved_swap);
4545 VM_BUG_ON(mc.moving_task); 4751 VM_BUG_ON(mc.moving_task);
4752 VM_BUG_ON(mc.mm);
4753
4754 mem_cgroup_start_move(from);
4546 spin_lock(&mc.lock); 4755 spin_lock(&mc.lock);
4547 mc.from = from; 4756 mc.from = from;
4548 mc.to = mem; 4757 mc.to = mem;
4549 mc.precharge = 0; 4758 mc.precharge = 0;
4550 mc.moved_charge = 0; 4759 mc.moved_charge = 0;
4551 mc.moved_swap = 0; 4760 mc.moved_swap = 0;
4552 mc.moving_task = current;
4553 spin_unlock(&mc.lock); 4761 spin_unlock(&mc.lock);
4762 mc.moving_task = current;
4763 mc.mm = mm;
4554 4764
4555 ret = mem_cgroup_precharge_mc(mm); 4765 ret = mem_cgroup_precharge_mc(mm);
4556 if (ret) 4766 if (ret)
4557 mem_cgroup_clear_mc(); 4767 mem_cgroup_clear_mc();
4558 } 4768 /* We call up_read() and mmput() in clear_mc(). */
4559 mmput(mm); 4769 } else
4770 mmput(mm);
4560 } 4771 }
4561 return ret; 4772 return ret;
4562} 4773}
@@ -4644,7 +4855,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4644 struct vm_area_struct *vma; 4855 struct vm_area_struct *vma;
4645 4856
4646 lru_add_drain_all(); 4857 lru_add_drain_all();
4647 down_read(&mm->mmap_sem); 4858 /* We've already held the mmap_sem */
4648 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4859 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4649 int ret; 4860 int ret;
4650 struct mm_walk mem_cgroup_move_charge_walk = { 4861 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4663,7 +4874,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4663 */ 4874 */
4664 break; 4875 break;
4665 } 4876 }
4666 up_read(&mm->mmap_sem);
4667} 4877}
4668 4878
4669static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4879static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4672,17 +4882,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4672 struct task_struct *p, 4882 struct task_struct *p,
4673 bool threadgroup) 4883 bool threadgroup)
4674{ 4884{
4675 struct mm_struct *mm; 4885 if (!mc.mm)
4676
4677 if (!mc.to)
4678 /* no need to move charge */ 4886 /* no need to move charge */
4679 return; 4887 return;
4680 4888
4681 mm = get_task_mm(p); 4889 mem_cgroup_move_charge(mc.mm);
4682 if (mm) {
4683 mem_cgroup_move_charge(mm);
4684 mmput(mm);
4685 }
4686 mem_cgroup_clear_mc(); 4890 mem_cgroup_clear_mc();
4687} 4891}
4688#else /* !CONFIG_MMU */ 4892#else /* !CONFIG_MMU */
@@ -4723,10 +4927,20 @@ struct cgroup_subsys mem_cgroup_subsys = {
4723}; 4927};
4724 4928
4725#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4929#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4930static int __init enable_swap_account(char *s)
4931{
4932 /* consider enabled if no parameter or 1 is given */
4933 if (!s || !strcmp(s, "1"))
4934 really_do_swap_account = 1;
4935 else if (!strcmp(s, "0"))
4936 really_do_swap_account = 0;
4937 return 1;
4938}
4939__setup("swapaccount", enable_swap_account);
4726 4940
4727static int __init disable_swap_account(char *s) 4941static int __init disable_swap_account(char *s)
4728{ 4942{
4729 really_do_swap_account = 0; 4943 enable_swap_account("0");
4730 return 1; 4944 return 1;
4731} 4945}
4732__setup("noswapaccount", disable_swap_account); 4946__setup("noswapaccount", disable_swap_account);
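Taken together with the CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED hunk at the top, the default now comes from Kconfig and can be overridden at boot: "swapaccount=0" disables swap accounting, "swapaccount" or "swapaccount=1" enables it, and the older "noswapaccount" is kept as an alias for "=0". A userspace sketch of the same decision logic, illustrative only (config_default stands in for the Kconfig choice):

#include <string.h>

static int swap_account_enabled(const char *arg, int config_default)
{
	if (!arg || !*arg || !strcmp(arg, "1"))
		return 1;              /* "swapaccount" or "swapaccount=1" */
	if (!strcmp(arg, "0"))
		return 0;              /* "swapaccount=0" (or "noswapaccount") */
	return config_default;         /* unrecognized value keeps the default */
}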