Diffstat (limited to 'mm')
-rw-r--r--  mm/highmem.c       6
-rw-r--r--  mm/memcontrol.c  406
-rw-r--r--  mm/swap.c          1
3 files changed, 302 insertions, 111 deletions
diff --git a/mm/highmem.c b/mm/highmem.c
index 781e754a75ac..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
 #include <linux/kgdb.h>
 #include <asm/tlbflush.h>
 
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
 /*
  * Virtual_count is not a pure "count".
  * 0 means that it is not mapped, and has not been mapped
@@ -43,7 +48,6 @@ unsigned long totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(totalhigh_pages);
 
 
-DEFINE_PER_CPU(int, __kmap_atomic_idx);
 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
 
 unsigned int nr_free_highpages (void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..9a99cfaf0a19 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,10 @@ enum mem_cgroup_stat_index {
         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
         MEM_CGROUP_STAT_SWAPOUT,        /* # of pages, swapped out */
-        MEM_CGROUP_EVENTS,              /* incremented at every pagein/pageout */
+        MEM_CGROUP_STAT_DATA,           /* end of data requires synchronization */
+        /* incremented at every pagein/pageout */
+        MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
+        MEM_CGROUP_ON_MOVE,             /* someone is moving account between groups */
 
         MEM_CGROUP_STAT_NSTATS,
 };
@@ -254,6 +257,12 @@ struct mem_cgroup {
          * percpu counter.
          */
         struct mem_cgroup_stat_cpu *stat;
+        /*
+         * Used when a cpu is offlined or for other kinds of synchronization;
+         * see mem_cgroup_read_stat().
+         */
+        struct mem_cgroup_stat_cpu nocpu_base;
+        spinlock_t pcp_counter_lock;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -530,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
         return mz;
 }
 
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both vmstat[] and percpu_counter use thresholds and periodic
+ * synchronization to implement a "quick" read. There is a trade-off between
+ * the cost of a read and the precision of the value, so we could also
+ * implement periodic synchronization of the counters in memcg.
+ *
+ * But this _read() function is currently used for the user interface. Users
+ * account memory usage per memory cgroup and _always_ require an exact value
+ * because they are accounting memory. Even if we provided a quick-and-fuzzy
+ * read, we would still have to visit all online cpus and compute the sum, so
+ * for now the extra synchronization is not implemented (it is only
+ * implemented for cpu hotplug).
+ *
+ * If kernel-internal users appear that can make use of an inexact value, and
+ * reading all cpu values becomes a performance bottleneck in some common
+ * workload, thresholds and synchronization as in vmstat[] should be
+ * implemented.
+ */
 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
                 enum mem_cgroup_stat_index idx)
 {
         int cpu;
         s64 val = 0;
 
-        for_each_possible_cpu(cpu)
+        get_online_cpus();
+        for_each_online_cpu(cpu)
                 val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+        spin_lock(&mem->pcp_counter_lock);
+        val += mem->nocpu_base.count[idx];
+        spin_unlock(&mem->pcp_counter_lock);
+#endif
+        put_online_cpus();
         return val;
 }
 
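
The implementation note above amounts to: sum the live per-cpu buckets on every read, and keep one lock-protected fallback bucket (nocpu_base) that absorbs whatever offlined cpus left behind. Below is a minimal standalone C model of that read path; it is plain userspace code with made-up names (struct counter, NBUCKETS, counter_drain), not kernel code, and only sketches the idea under those assumptions.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 4                      /* stands in for the online cpus */

struct counter {
        int64_t percpu[NBUCKETS];       /* models mem->stat->count[idx] per cpu */
        int64_t nocpu_base;             /* models mem->nocpu_base.count[idx] */
        pthread_mutex_t base_lock;      /* models mem->pcp_counter_lock */
};

/* models mem_cgroup_read_stat(): sum the online buckets, then add the base */
static int64_t counter_read(struct counter *c, const int *online)
{
        int64_t val = 0;

        for (int i = 0; i < NBUCKETS; i++)      /* for_each_online_cpu() */
                if (online[i])
                        val += c->percpu[i];
        pthread_mutex_lock(&c->base_lock);
        val += c->nocpu_base;                   /* counts drained from dead cpus */
        pthread_mutex_unlock(&c->base_lock);
        return val;
}

/* models mem_cgroup_drain_pcp_counter(): fold a dead bucket into the base */
static void counter_drain(struct counter *c, int cpu)
{
        pthread_mutex_lock(&c->base_lock);
        c->nocpu_base += c->percpu[cpu];
        c->percpu[cpu] = 0;
        pthread_mutex_unlock(&c->base_lock);
}

int main(void)
{
        struct counter c = {
                .percpu = {5, 7, 3, 0},
                .base_lock = PTHREAD_MUTEX_INITIALIZER,
        };
        int online[NBUCKETS] = {1, 1, 0, 1};    /* "cpu" 2 has gone offline */

        counter_drain(&c, 2);
        printf("total = %lld\n", (long long)counter_read(&c, online)); /* 15 */
        return 0;
}

Reading still has to walk every bucket, which is exactly why the note argues that a fuzzy read would not be cheaper here.
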
@@ -659,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
         return mem;
 }
 
-/*
- * Call callback function against all cgroup under hierarchy tree.
- */
-static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
-                        int (*func)(struct mem_cgroup *, void *))
+/* The caller has to guarantee "mem" exists before calling this */
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
 {
-        int found, ret, nextid;
         struct cgroup_subsys_state *css;
-        struct mem_cgroup *mem;
-
-        if (!root->use_hierarchy)
-                return (*func)(root, data);
+        int found;
 
-        nextid = 1;
-        do {
-                ret = 0;
+        if (!mem) /* ROOT cgroup has the smallest ID */
+                return root_mem_cgroup; /* css_put/get against root is ignored */
+        if (!mem->use_hierarchy) {
+                if (css_tryget(&mem->css))
+                        return mem;
+                return NULL;
+        }
+        rcu_read_lock();
+        /*
+         * search for the memory cgroup which has the smallest ID under the
+         * given ROOT cgroup. (ID >= 1)
+         */
+        css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+        if (css && css_tryget(css))
+                mem = container_of(css, struct mem_cgroup, css);
+        else
                 mem = NULL;
+        rcu_read_unlock();
+        return mem;
+}
+
+static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
+                                        struct mem_cgroup *root,
+                                        bool cond)
+{
+        int nextid = css_id(&iter->css) + 1;
+        int found;
+        int hierarchy_used;
+        struct cgroup_subsys_state *css;
+
+        hierarchy_used = iter->use_hierarchy;
 
+        css_put(&iter->css);
+        /* If no ROOT, walk all, ignore hierarchy */
+        if (!cond || (root && !hierarchy_used))
+                return NULL;
+
+        if (!root)
+                root = root_mem_cgroup;
+
+        do {
+                iter = NULL;
                 rcu_read_lock();
-                css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
-                                &found);
+
+                css = css_get_next(&mem_cgroup_subsys, nextid,
+                                &root->css, &found);
                 if (css && css_tryget(css))
-                        mem = container_of(css, struct mem_cgroup, css);
+                        iter = container_of(css, struct mem_cgroup, css);
                 rcu_read_unlock();
-
-                if (mem) {
-                        ret = (*func)(mem, data);
-                        css_put(&mem->css);
-                }
+                /* If css is NULL, no more cgroups will be found */
                 nextid = found + 1;
-        } while (!ret && css);
+        } while (css && !iter);
 
-        return ret;
+        return iter;
 }
+
+/*
+ * for_each_mem_cgroup_tree() is for visiting all cgroups under a tree. Be
+ * careful that breaking out of the loop is not allowed; we hold a reference
+ * count. Instead, set "cond" to false and "continue" to exit the loop.
+ */
+#define for_each_mem_cgroup_tree_cond(iter, root, cond)        \
+        for (iter = mem_cgroup_start_loop(root);\
+             iter != NULL;\
+             iter = mem_cgroup_get_next(iter, root, cond))
+
+#define for_each_mem_cgroup_tree(iter, root) \
+        for_each_mem_cgroup_tree_cond(iter, root, true)
+
+#define for_each_mem_cgroup_all(iter) \
+        for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 {
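
A hypothetical caller of the new iterator might look like the fragment below. It is not part of this patch and would need the surrounding memcontrol.c definitions to build; the point is the rule from the comment above: request an early exit by clearing cond and continuing, so that mem_cgroup_get_next() still runs and drops the css reference.

/* Illustrative only: sums RSS across a subtree, stopping once a limit is hit. */
static s64 mem_cgroup_sum_rss_until(struct mem_cgroup *root, s64 limit)
{
        struct mem_cgroup *iter;
        bool cond = true;
        s64 total = 0;

        for_each_mem_cgroup_tree_cond(iter, root, cond) {
                total += mem_cgroup_read_stat(iter, MEM_CGROUP_STAT_RSS);
                if (total >= limit) {
                        cond = false;   /* ask the iterator to stop... */
                        continue;       /* ...but let it css_put() first */
                }
        }
        return total;
}
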
@@ -1051,7 +1129,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
         return swappiness;
 }
 
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+        int cpu;
+
+        get_online_cpus();
+        spin_lock(&mem->pcp_counter_lock);
+        for_each_online_cpu(cpu)
+                per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+        mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+        spin_unlock(&mem->pcp_counter_lock);
+        put_online_cpus();
+
+        synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+        int cpu;
+
+        if (!mem)
+                return;
+        get_online_cpus();
+        spin_lock(&mem->pcp_counter_lock);
+        for_each_online_cpu(cpu)
+                per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+        mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+        spin_unlock(&mem->pcp_counter_lock);
+        put_online_cpus();
+}
+/*
+ * Two routines for checking whether "mem" is under move_account() or not.
+ *
+ * mem_cgroup_stealed() - checks whether a cgroup is mc.from or not. This is
+ *                        used to avoid races in accounting. If true,
+ *                        pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to, or
+ *                           is under the hierarchy of moving cgroups. This is
+ *                           used for waiting at high memory pressure caused
+ *                           by "move".
+ */
+
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+        VM_BUG_ON(!rcu_read_lock_held());
+        return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
 
 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 {
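
The start/end/stealed trio above is a counting flag rather than a boolean, so overlapping moves nest and the flag only drops to zero when the last mover finishes; mem_cgroup_stealed() simply asks whether the local counter is still positive. A tiny standalone model of that bookkeeping (plain userspace C with invented names, not kernel code, and without the RCU grace period):

#include <assert.h>
#include <stdio.h>

#define NBUCKETS 4

static int on_move[NBUCKETS];   /* models count[MEM_CGROUP_ON_MOVE] per cpu */
static int on_move_base;        /* models nocpu_base.count[MEM_CGROUP_ON_MOVE] */

static void start_move(void)    /* models mem_cgroup_start_move() */
{
        for (int i = 0; i < NBUCKETS; i++)
                on_move[i] += 1;
        on_move_base += 1;
        /* the kernel version also calls synchronize_rcu() here */
}

static void end_move(void)      /* models mem_cgroup_end_move() */
{
        for (int i = 0; i < NBUCKETS; i++)
                on_move[i] -= 1;
        on_move_base -= 1;
}

static int stealed(int cpu)     /* models mem_cgroup_stealed() */
{
        return on_move[cpu] > 0;
}

int main(void)
{
        start_move();
        start_move();           /* two moves in flight */
        end_move();
        assert(stealed(1));     /* still "stealed": one mover remains */
        end_move();
        assert(!stealed(1));
        printf("ok\n");
        return 0;
}
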
@@ -1092,13 +1215,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
         return false;
 }
 
-static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
-{
-        int *val = data;
-        (*val)++;
-        return 0;
-}
-
 /**
  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1289,10 @@ done:
 static int mem_cgroup_count_children(struct mem_cgroup *mem)
 {
         int num = 0;
-        mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
+        struct mem_cgroup *iter;
+
+        for_each_mem_cgroup_tree(iter, mem)
+                num++;
         return num;
 }
 
@@ -1322,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
         return total;
 }
 
-static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
-{
-        int *val = (int *)data;
-        int x;
-        /*
-         * Logically, we can stop scanning immediately when we find
-         * a memcg is already locked. But condidering unlock ops and
-         * creation/removal of memcg, scan-all is simple operation.
-         */
-        x = atomic_inc_return(&mem->oom_lock);
-        *val = max(x, *val);
-        return 0;
-}
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 {
-        int lock_count = 0;
+        int x, lock_count = 0;
+        struct mem_cgroup *iter;
 
-        mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+        for_each_mem_cgroup_tree(iter, mem) {
+                x = atomic_inc_return(&iter->oom_lock);
+                lock_count = max(x, lock_count);
+        }
 
         if (lock_count == 1)
                 return true;
         return false;
 }
 
-static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
+        struct mem_cgroup *iter;
+
         /*
          * When a new child is created while the hierarchy is under oom,
          * mem_cgroup_oom_lock() may not be called. We have to use
          * atomic_add_unless() here.
          */
-        atomic_add_unless(&mem->oom_lock, -1, 0);
+        for_each_mem_cgroup_tree(iter, mem)
+                atomic_add_unless(&iter->oom_lock, -1, 0);
         return 0;
 }
 
-static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
-{
-        mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
-}
 
 static DEFINE_MUTEX(memcg_oom_mutex);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup members, but
+ * it tends to be costly. Considering some conditions, we don't need to do
+ * so _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to the radix-tree.
+ * There is no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg intentionally doesn't clear
+ * pc->mem_cgroup at "uncharge", so we always see a valid pc->mem_cgroup
+ * even if we race with "uncharge". The statistics themselves are properly
+ * handled by flags.
+ *
+ * Considering "move", this is the only case where we see a race. To keep
+ * the race window small, we check the MEM_CGROUP_ON_MOVE percpu value and
+ * detect a possible race. If there is one, we take a lock.
  */
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+
+static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
 {
         struct mem_cgroup *mem;
-        struct page_cgroup *pc;
+        struct page_cgroup *pc = lookup_page_cgroup(page);
+        bool need_unlock = false;
 
-        pc = lookup_page_cgroup(page);
         if (unlikely(!pc))
                 return;
 
-        lock_page_cgroup(pc);
+        rcu_read_lock();
         mem = pc->mem_cgroup;
-        if (!mem || !PageCgroupUsed(pc))
-                goto done;
+        if (unlikely(!mem || !PageCgroupUsed(pc)))
+                goto out;
+        /* pc->mem_cgroup is unstable ? */
+        if (unlikely(mem_cgroup_stealed(mem))) {
+                /* take a lock to safely access pc->mem_cgroup */
+                lock_page_cgroup(pc);
+                need_unlock = true;
+                mem = pc->mem_cgroup;
+                if (!mem || !PageCgroupUsed(pc))
+                        goto out;
+        }
 
-        /*
-         * Preemption is already disabled. We can use __this_cpu_xxx
-         */
-        if (val > 0) {
-                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-                SetPageCgroupFileMapped(pc);
-        } else {
-                __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-                ClearPageCgroupFileMapped(pc);
+        this_cpu_add(mem->stat->count[idx], val);
+
+        switch (idx) {
+        case MEM_CGROUP_STAT_FILE_MAPPED:
+                if (val > 0)
+                        SetPageCgroupFileMapped(pc);
+                else if (!page_mapped(page))
+                        ClearPageCgroupFileMapped(pc);
+                break;
+        default:
+                BUG();
         }
 
-done:
-        unlock_page_cgroup(pc);
+out:
+        if (unlikely(need_unlock))
+                unlock_page_cgroup(pc);
+        rcu_read_unlock();
+        return;
+}
+
+void mem_cgroup_update_file_mapped(struct page *page, int val)
+{
+        mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
 }
 
 /*
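
The comment block above notes that the routine can be generalized to other per-page statistics. As a purely illustrative sketch, another flag-backed statistic would slot into the same switch; MEM_CGROUP_STAT_FILE_DIRTY and the page-cgroup dirty helpers below are invented names, not anything this patch adds.

        /* hypothetical extra case inside mem_cgroup_update_file_stat() */
        case MEM_CGROUP_STAT_FILE_DIRTY:                /* invented index */
                if (val > 0)
                        SetPageCgroupFileDirty(pc);     /* invented helper */
                else
                        ClearPageCgroupFileDirty(pc);   /* invented helper */
                break;
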
@@ -1605,15 +1753,55 @@ static void drain_all_stock_sync(void)
         atomic_dec(&memcg_drain_count);
 }
 
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains percpu counter values from a DEAD cpu and
+ * moves them to the local cpu. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+        int i;
+
+        spin_lock(&mem->pcp_counter_lock);
+        for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+                s64 x = per_cpu(mem->stat->count[i], cpu);
+
+                per_cpu(mem->stat->count[i], cpu) = 0;
+                mem->nocpu_base.count[i] += x;
+        }
+        /* need to clear the ON_MOVE value; it works as a kind of lock */
+        per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+        spin_unlock(&mem->pcp_counter_lock);
+}
+
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+{
+        int idx = MEM_CGROUP_ON_MOVE;
+
+        spin_lock(&mem->pcp_counter_lock);
+        per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
+        spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                         unsigned long action,
                                         void *hcpu)
 {
         int cpu = (unsigned long)hcpu;
         struct memcg_stock_pcp *stock;
+        struct mem_cgroup *iter;
+
+        if ((action == CPU_ONLINE)) {
+                for_each_mem_cgroup_all(iter)
+                        synchronize_mem_cgroup_on_move(iter, cpu);
+                return NOTIFY_OK;
+        }
 
-        if (action != CPU_DEAD)
+        if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
                 return NOTIFY_OK;
+
+        for_each_mem_cgroup_all(iter)
+                mem_cgroup_drain_pcp_counter(iter, cpu);
+
         stock = &per_cpu(memcg_stock, cpu);
         drain_stock(stock);
         return NOTIFY_OK;
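
One subtle point in the callback above: on CPU_DEAD the data counters of the dead cpu are folded into nocpu_base, but the ON_MOVE slot is only cleared, and a cpu that comes back online re-reads ON_MOVE from the base, so a move that is still in flight stays visible everywhere. A small standalone model of just that ON_MOVE handling (userspace C, invented names, not kernel code):

#include <assert.h>
#include <stdio.h>

#define NCPUS 2

static int on_move[NCPUS];      /* per-cpu MEM_CGROUP_ON_MOVE slots */
static int on_move_base;        /* nocpu_base slot, kept up to date by the movers */

static void cpu_dead(int cpu)   /* models the ON_MOVE part of mem_cgroup_drain_pcp_counter() */
{
        on_move[cpu] = 0;       /* data counters are folded into the base; ON_MOVE is just cleared */
}

static void cpu_online(int cpu) /* models synchronize_mem_cgroup_on_move() */
{
        on_move[cpu] = on_move_base;
}

int main(void)
{
        /* a move starts: every cpu slot and the base are raised */
        for (int i = 0; i < NCPUS; i++)
                on_move[i] = 1;
        on_move_base = 1;

        cpu_dead(1);            /* cpu 1 disappears mid-move */
        cpu_online(1);          /* it comes back: the flag is restored from the base */
        assert(on_move[1] > 0);
        printf("ok\n");
        return 0;
}
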
@@ -3038,6 +3226,7 @@ move_account:
         lru_add_drain_all();
         drain_all_stock_sync();
         ret = 0;
+        mem_cgroup_start_move(mem);
         for_each_node_state(node, N_HIGH_MEMORY) {
                 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
                         enum lru_list l;
@@ -3051,6 +3240,7 @@ move_account:
                 if (ret)
                         break;
         }
+        mem_cgroup_end_move(mem);
         memcg_oom_recover(mem);
         /* it seems parent cgroup doesn't have enough mem */
         if (ret == -ENOMEM)
@@ -3137,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
         return retval;
 }
 
-struct mem_cgroup_idx_data {
-        s64 val;
-        enum mem_cgroup_stat_index idx;
-};
 
-static int
-mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+                                enum mem_cgroup_stat_index idx)
 {
-        struct mem_cgroup_idx_data *d = data;
-        d->val += mem_cgroup_read_stat(mem, d->idx);
-        return 0;
-}
+        struct mem_cgroup *iter;
+        s64 val = 0;
 
-static void
-mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
-                                enum mem_cgroup_stat_index idx, s64 *val)
-{
-        struct mem_cgroup_idx_data d;
-        d.idx = idx;
-        d.val = 0;
-        mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
-        *val = d.val;
+        /* each per-cpu value can be negative, so use s64 */
+        for_each_mem_cgroup_tree(iter, mem)
+                val += mem_cgroup_read_stat(iter, idx);
+
+        if (val < 0) /* race ? */
+                val = 0;
+        return val;
 }
 
 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
 {
-        u64 idx_val, val;
+        u64 val;
 
         if (!mem_cgroup_is_root(mem)) {
                 if (!swap)
@@ -3172,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
                 return res_counter_read_u64(&mem->memsw, RES_USAGE);
         }
 
-        mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
-        val = idx_val;
-        mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
-        val += idx_val;
+        val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
+        val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
 
-        if (swap) {
-                mem_cgroup_get_recursive_idx_stat(mem,
-                                MEM_CGROUP_STAT_SWAPOUT, &idx_val);
-                val += idx_val;
-        }
+        if (swap)
+                val += mem_cgroup_get_recursive_idx_stat(mem,
+                                MEM_CGROUP_STAT_SWAPOUT);
 
         return val << PAGE_SHIFT;
 }
@@ -3389,9 +3567,9 @@ struct {
 };
 
 
-static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
+static void
+mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 {
-        struct mcs_total_stat *s = data;
         s64 val;
 
         /* per cpu stat */
@@ -3421,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
         val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
-        return 0;
 }
 
 static void
 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 {
-        mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
+        struct mem_cgroup *iter;
+
+        for_each_mem_cgroup_tree(iter, mem)
+                mem_cgroup_get_local_stat(iter, s);
 }
 
 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b)
         return _a->threshold - _b->threshold;
 }
 
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
 {
         struct mem_cgroup_eventfd_list *ev;
 
@@ -3615,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
 
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
 {
-        mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
+        struct mem_cgroup *iter;
+
+        for_each_mem_cgroup_tree(iter, mem)
+                mem_cgroup_oom_notify_cb(iter);
 }
 
 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -4032,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                 vfree(mem);
                 mem = NULL;
         }
+        spin_lock_init(&mem->pcp_counter_lock);
         return mem;
 }
 
@@ -4158,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                                         &per_cpu(memcg_stock, cpu);
                         INIT_WORK(&stock->work, drain_local_stock);
                 }
-                hotcpu_notifier(memcg_stock_cpu_callback, 0);
+                hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
         } else {
                 parent = mem_cgroup_from_cont(cont->parent);
                 mem->use_hierarchy = parent->use_hierarchy;
@@ -4513,6 +4697,7 @@ static void mem_cgroup_clear_mc(void)
         mc.to = NULL;
         mc.moving_task = NULL;
         spin_unlock(&mc.lock);
+        mem_cgroup_end_move(from);
         memcg_oom_recover(from);
         memcg_oom_recover(to);
         wake_up_all(&mc.waitq);
@@ -4543,6 +4728,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                 VM_BUG_ON(mc.moved_charge);
                 VM_BUG_ON(mc.moved_swap);
                 VM_BUG_ON(mc.moving_task);
+                mem_cgroup_start_move(from);
                 spin_lock(&mc.lock);
                 mc.from = from;
                 mc.to = mem;
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3f4854205b16 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
         pagevec_free(&pages_to_free);
 }
+EXPORT_SYMBOL(release_pages);
 
 /*
  * The pages which we're about to release may be in the deferred lru-addition