Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	419
1 files changed, 317 insertions, 102 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e0c2066495e3..878808c4fcbe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -38,6 +38,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
+#include <linux/cpu.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #define do_swap_account	(0)
 #endif
 
-static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
 #define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index {
  */
 	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_MAPPED_FILE,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
@@ -275,6 +275,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+static void drain_all_stock_async(void);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -758,7 +759,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	task_unlock(task);
 	if (!curr)
 		return 0;
-	if (curr->use_hierarchy)
+	/*
+	 * We should check use_hierarchy of "mem" not "curr". Because checking
+	 * use_hierarchy of "curr" here make this function true if hierarchy is
+	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
+	 * hierarchy(even if use_hierarchy is disabled in "mem").
+	 */
+	if (mem->use_hierarchy)
 		ret = css_is_ancestor(&curr->css, &mem->css);
 	else
 		ret = (curr == mem);
@@ -1007,7 +1014,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	static char memcg_name[PATH_MAX];
 	int ret;
 
-	if (!memcg)
+	if (!memcg || !p)
 		return;
 
 
@@ -1137,6 +1144,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
+			if (loop >= 1)
+				drain_all_stock_async();
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1223,7 +1232,7 @@ static void record_last_oom(struct mem_cgroup *mem)
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  */
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct mem_cgroup_stat *stat;
@@ -1231,9 +1240,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	int cpu;
 	struct page_cgroup *pc;
 
-	if (!page_is_file_cache(page))
-		return;
-
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
@@ -1253,12 +1259,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	stat = &mem->stat;
 	cpustat = &stat->cpustat[cpu];
 
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
 	unlock_page_cgroup(pc);
 }
 
 /*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define CHARGE_SIZE	(32 * PAGE_SIZE)
+struct memcg_stock_pcp {
+	struct mem_cgroup *cached; /* this never be root cgroup */
+	int charge;
+	struct work_struct work;
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static atomic_t memcg_drain_count;
+
+/*
+ * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
+ * from local stock and true is returned. If the stock is 0 or charges from a
+ * cgroup which is not current target, returns false. This stock will be
+ * refilled.
+ */
+static bool consume_stock(struct mem_cgroup *mem)
+{
+	struct memcg_stock_pcp *stock;
+	bool ret = true;
+
+	stock = &get_cpu_var(memcg_stock);
+	if (mem == stock->cached && stock->charge)
+		stock->charge -= PAGE_SIZE;
+	else /* need to call res_counter_charge */
+		ret = false;
+	put_cpu_var(memcg_stock);
+	return ret;
+}
+
+/*
+ * Returns stocks cached in percpu to res_counter and reset cached information.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+	struct mem_cgroup *old = stock->cached;
+
+	if (stock->charge) {
+		res_counter_uncharge(&old->res, stock->charge);
+		if (do_swap_account)
+			res_counter_uncharge(&old->memsw, stock->charge);
+	}
+	stock->cached = NULL;
+	stock->charge = 0;
+}
+
+/*
+ * This must be called under preempt disabled or must be called by
+ * a thread which is pinned to local cpu.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
+	drain_stock(stock);
+}
+
+/*
+ * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * This will be consumed by consume_stock() function, later.
+ */
+static void refill_stock(struct mem_cgroup *mem, int val)
+{
+	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+
+	if (stock->cached != mem) { /* reset if necessary */
+		drain_stock(stock);
+		stock->cached = mem;
+	}
+	stock->charge += val;
+	put_cpu_var(memcg_stock);
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just put a work per cpu for draining localy on each cpu. Caller can
+ * expects some charges will be back to res_counter later but cannot wait for
+ * it.
+ */
+static void drain_all_stock_async(void)
+{
+	int cpu;
+	/* This function is for scheduling "drain" in asynchronous way.
+	 * The result of "drain" is not directly handled by callers. Then,
+	 * if someone is calling drain, we don't have to call drain more.
+	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
+	 * there is a race. We just do loose check here.
+	 */
+	if (atomic_read(&memcg_drain_count))
+		return;
+	/* Notify other cpus that system-wide "drain" is running */
+	atomic_inc(&memcg_drain_count);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+		schedule_work_on(cpu, &stock->work);
+	}
+	put_online_cpus();
+	atomic_dec(&memcg_drain_count);
+	/* We don't wait for flush_work */
+}
+
+/* This is a synchronous drain interface. */
+static void drain_all_stock_sync(void)
+{
+	/* called when force_empty is called */
+	atomic_inc(&memcg_drain_count);
+	schedule_on_each_cpu(drain_local_stock);
+	atomic_dec(&memcg_drain_count);
+}
+
+static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+					unsigned long action,
+					void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct memcg_stock_pcp *stock;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+	stock = &per_cpu(memcg_stock, cpu);
+	drain_stock(stock);
+	return NOTIFY_OK;
+}
+
+/*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
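
The block added above is the core of the patch: instead of touching the shared res_counter for every page, try_charge takes CHARGE_SIZE bytes at once and parks the surplus in a per-cpu stock, so later charges by the same cgroup on that cpu are served locally. Below is a minimal single-threaded sketch of that idea; it is an illustrative userspace model with made-up names and plain integers standing in for res_counter, not the kernel code.

/* Illustrative model of the per-cpu stock, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_PAGE  4096L
#define MODEL_BATCH (32 * MODEL_PAGE)	/* CHARGE_SIZE analogue */

struct model_stock {
	const void *cached;	/* which cgroup owns the stocked bytes */
	long charge;		/* bytes already charged to the global counter */
};

/* Serve one page from the stock if it belongs to 'group'. */
static bool model_consume(struct model_stock *s, const void *group)
{
	if (s->cached == group && s->charge >= MODEL_PAGE) {
		s->charge -= MODEL_PAGE;	/* no shared counter touched */
		return true;
	}
	return false;				/* caller must charge globally */
}

/* Park surplus bytes from a batched global charge in the stock. */
static void model_refill(struct model_stock *s, const void *group, long val)
{
	if (s->cached != group) {	/* stock belongs to another group */
		s->charge = 0;		/* the kernel returns it to the counter */
		s->cached = group;
	}
	s->charge += val;
}

int main(void)
{
	struct model_stock stock = { 0 };
	const char group[] = "A";
	long counter = 0;		/* stands in for res_counter usage */
	int i, global_updates = 0;

	for (i = 0; i < 64; i++) {
		if (model_consume(&stock, group))
			continue;
		counter += MODEL_BATCH;	/* one batched global charge */
		global_updates++;
		model_refill(&stock, group, MODEL_BATCH - MODEL_PAGE);
	}
	printf("64 pages charged with %d global updates\n", global_updates);
	return 0;
}

With a 32-page batch, 64 single-page charges collapse into two updates of the shared counter; the rest are served from the local stock.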
@@ -1269,6 +1402,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct res_counter *fail_res;
+	int csize = CHARGE_SIZE;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1293,23 +1427,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		return 0;
 
 	VM_BUG_ON(css_is_removed(&mem->css));
+	if (mem_cgroup_is_root(mem))
+		goto done;
 
 	while (1) {
 		int ret = 0;
 		unsigned long flags = 0;
 
-		if (mem_cgroup_is_root(mem))
-			goto done;
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+		if (consume_stock(mem))
+			goto charged;
+
+		ret = res_counter_charge(&mem->res, csize, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
-			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res);
+			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			res_counter_uncharge(&mem->res, csize);
 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1318,6 +1454,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									res);
 
+		/* reduce request size and retry */
+		if (csize > PAGE_SIZE) {
+			csize = PAGE_SIZE;
+			continue;
+		}
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
@@ -1339,14 +1480,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
 		if (!nr_retries--) {
 			if (oom) {
-				mutex_lock(&memcg_tasklist);
 				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				mutex_unlock(&memcg_tasklist);
 				record_last_oom(mem_over_limit);
 			}
 			goto nomem;
 		}
 	}
+	if (csize > PAGE_SIZE)
+		refill_stock(mem, csize - PAGE_SIZE);
+charged:
 	/*
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
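
One consequence of asking for CHARGE_SIZE at a time is that a batch can fail where a single page would still fit under the limit, so the hunk above retries once with csize reduced to PAGE_SIZE before falling into reclaim. A sketch of just that fallback, with invented names and a plain struct standing in for res_counter (illustrative only, not the kernel API):

/* Illustrative model of the reduce-and-retry fallback, not kernel code. */
#define MODEL_PAGE  4096L
#define MODEL_BATCH (32 * MODEL_PAGE)

struct model_counter { long usage, limit; };

static int model_charge(struct model_counter *c, long bytes)
{
	if (c->usage + bytes > c->limit)
		return -1;		/* over limit */
	c->usage += bytes;
	return 0;
}

/* Returns bytes charged (batch or one page), or -1 when reclaim is needed. */
static long model_try_charge(struct model_counter *c)
{
	long csize = MODEL_BATCH;

	for (;;) {
		if (model_charge(c, csize) == 0)
			return csize;		/* surplus would refill the stock */
		if (csize > MODEL_PAGE) {
			csize = MODEL_PAGE;	/* reduce request size and retry */
			continue;
		}
		return -1;			/* kernel would reclaim or OOM here */
	}
}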
@@ -1361,6 +1503,21 @@ nomem:
 }
 
 /*
+ * Somemtimes we have to undo a charge we got by try_charge().
+ * This function is for that and do uncharge, put css's refcnt.
+ * gotten by try_charge().
+ */
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+{
+	if (!mem_cgroup_is_root(mem)) {
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	}
+	css_put(&mem->css);
+}
+
+/*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
  * it's concern. (dropping refcnt from swap can be called against removed
@@ -1426,12 +1583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		if (!mem_cgroup_is_root(mem)) {
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			if (do_swap_account)
-				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-		}
-		css_put(&mem->css);
+		mem_cgroup_cancel_charge(mem);
 		return;
 	}
 
@@ -1464,27 +1616,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 }
 
 /**
- * mem_cgroup_move_account - move account of the page
+ * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
  *
  * The caller must confirm following.
  * - page is not on LRU (isolate_page() is useful.)
- *
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
  *
  * This function does "uncharge" from old cgroup but doesn't do "charge" to
  * new cgroup. It should be done by a caller.
  */
 
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to)
 {
-	struct mem_cgroup_per_zone *from_mz, *to_mz;
-	int nid, zid;
-	int ret = -EBUSY;
 	struct page *page;
 	int cpu;
 	struct mem_cgroup_stat *stat;
@@ -1492,38 +1639,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
-
-	nid = page_cgroup_nid(pc);
-	zid = page_cgroup_zid(pc);
-	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
-	to_mz = mem_cgroup_zoneinfo(to, nid, zid);
-
-	if (!trylock_page_cgroup(pc))
-		return ret;
-
-	if (!PageCgroupUsed(pc))
-		goto out;
-
-	if (pc->mem_cgroup != from)
-		goto out;
+	VM_BUG_ON(!PageCgroupLocked(pc));
+	VM_BUG_ON(!PageCgroupUsed(pc));
+	VM_BUG_ON(pc->mem_cgroup != from);
 
 	if (!mem_cgroup_is_root(from))
 		res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
-	if (page_is_file_cache(page) && page_mapped(page)) {
+	if (page_mapped(page) && !PageAnon(page)) {
 		cpu = smp_processor_id();
 		/* Update mapped_file data for mem_cgroup "from" */
 		stat = &from->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						-1);
 
 		/* Update mapped_file data for mem_cgroup "to" */
 		stat = &to->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						1);
 	}
 
@@ -1534,15 +1670,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	css_get(&to->css);
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, pc, true);
-	ret = 0;
-out:
-	unlock_page_cgroup(pc);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
 	 * this function is just force_empty() and it's garanteed that
 	 * "to" is never removed. So, we don't check rmdir status here.
 	 */
+}
+
+/*
+ * check whether the @pc is valid for moving account and call
+ * __mem_cgroup_move_account()
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+		struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	int ret = -EINVAL;
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+		__mem_cgroup_move_account(pc, from, to);
+		ret = 0;
+	}
+	unlock_page_cgroup(pc);
 	return ret;
 }
 
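
The split above is a locking cleanup rather than a behaviour change: the underscored helper now assumes its preconditions (pc locked, used, owned by @from) and asserts them with VM_BUG_ON, while the small wrapper takes the lock, validates, and reports -EINVAL instead of the old trylock/-EBUSY dance. The same wrapper pattern in a self-contained sketch, using an illustrative pthread model rather than the kernel API:

/* Illustrative model of the locked-wrapper pattern, not kernel code. */
#include <pthread.h>

struct model_pc {
	pthread_mutex_t lock;
	int used;
	const void *owner;
};

static struct model_pc pc = { .lock = PTHREAD_MUTEX_INITIALIZER, .used = 1 };

/* Caller must hold pc->lock, pc->used must be set, pc->owner must be 'from'. */
static void __model_move(struct model_pc *p, const void *from, const void *to)
{
	(void)from;
	p->owner = to;
}

static int model_move(struct model_pc *p, const void *from, const void *to)
{
	int ret = -1;			/* -EINVAL analogue */

	pthread_mutex_lock(&p->lock);
	if (p->used && p->owner == from) {
		__model_move(p, from, to);
		ret = 0;
	}
	pthread_mutex_unlock(&p->lock);
	return ret;
}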
@@ -1564,45 +1713,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (!pcg)
 		return -EINVAL;
 
+	ret = -EBUSY;
+	if (!get_page_unless_zero(page))
+		goto out;
+	if (isolate_lru_page(page))
+		goto put;
 
 	parent = mem_cgroup_from_cont(pcg);
-
-
 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
-		return ret;
-
-	if (!get_page_unless_zero(page)) {
-		ret = -EBUSY;
-		goto uncharge;
-	}
-
-	ret = isolate_lru_page(page);
-
-	if (ret)
-		goto cancel;
+		goto put_back;
 
 	ret = mem_cgroup_move_account(pc, child, parent);
-
+	if (!ret)
+		css_put(&parent->css);	/* drop extra refcnt by try_charge() */
+	else
+		mem_cgroup_cancel_charge(parent);	/* does css_put */
+put_back:
 	putback_lru_page(page);
-	if (!ret) {
-		put_page(page);
-		/* drop extra refcnt by try_charge() */
-		css_put(&parent->css);
-		return 0;
-	}
-
-cancel:
+put:
 	put_page(page);
-uncharge:
-	/* drop extra refcnt by try_charge() */
-	css_put(&parent->css);
-	/* uncharge if move fails */
-	if (!mem_cgroup_is_root(parent)) {
-		res_counter_uncharge(&parent->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
-	}
+out:
 	return ret;
 }
 
@@ -1819,14 +1950,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
-	css_put(&mem->css);
+	mem_cgroup_cancel_charge(mem);
 }
 
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+	struct memcg_batch_info *batch = NULL;
+	bool uncharge_memsw = true;
+	/* If swapout, usage of swap doesn't decrease */
+	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		uncharge_memsw = false;
+	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continously can be expected to be in
+	 * the same cgroup and we have chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	batch = &current->memcg_batch;
+	/*
+	 * In usual, we do css_get() when we remember memcg pointer.
+	 * But in this case, we keep res->usage until end of a series of
+	 * uncharges. Then, it's ok to ignore memcg's refcnt.
+	 */
+	if (!batch->memcg)
+		batch->memcg = mem;
+	/*
+	 * In typical case, batch->memcg == mem. This means we can
+	 * merge a series of uncharges to an uncharge of res_counter.
+	 * If not, we uncharge res_counter ony by one.
+	 */
+	if (batch->memcg != mem)
+		goto direct_uncharge;
+	/* remember freed charge and uncharge it later */
+	batch->bytes += PAGE_SIZE;
+	if (uncharge_memsw)
+		batch->memsw_bytes += PAGE_SIZE;
+	return;
+direct_uncharge:
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (uncharge_memsw)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	return;
+}
 
 /*
  * uncharge if !page_mapped(page)
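
Per page, __do_uncharge above only decides whether the freed bytes can be coalesced: batching must be active, the task must not be dying with TIF_MEMDIE, and the page must belong to the memcg already recorded in current->memcg_batch; anything else is uncharged immediately. A compact model of that decision, with illustrative names and plain longs instead of res_counter (not the kernel code):

/* Illustrative model of the coalesce-or-uncharge decision, not kernel code. */
#include <stdbool.h>

#define MODEL_PAGE 4096L

struct model_memcg { long res, memsw; };

struct model_batch {
	int do_batch;
	struct model_memcg *memcg;	/* owner of the pending bytes */
	long bytes, memsw_bytes;
};

static void model_do_uncharge(struct model_batch *b, struct model_memcg *mem,
			      bool swapout, bool dying)
{
	bool uncharge_memsw = !swapout;	/* swap usage stays on swapout */

	if (!b->do_batch || dying)	/* dying task uncharges at once */
		goto direct;
	if (!b->memcg)
		b->memcg = mem;		/* first page fixes the owner */
	if (b->memcg != mem)
		goto direct;		/* different memcg: don't coalesce */
	b->bytes += MODEL_PAGE;		/* remember, flush at batch end */
	if (uncharge_memsw)
		b->memsw_bytes += MODEL_PAGE;
	return;
direct:
	mem->res -= MODEL_PAGE;
	if (uncharge_memsw)
		mem->memsw -= MODEL_PAGE;
}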
@@ -1875,12 +2045,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account &&
-		    (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
@@ -1926,6 +2092,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+/*
+ * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
+ * In that cases, pages are freed continuously and we can expect pages
+ * are in the same memcg. All these calls itself limits the number of
+ * pages freed at once, then uncharge_start/end() is called properly.
+ * This may be called prural(2) times in a context,
+ */
+
+void mem_cgroup_uncharge_start(void)
+{
+	current->memcg_batch.do_batch++;
+	/* We can do nest. */
+	if (current->memcg_batch.do_batch == 1) {
+		current->memcg_batch.memcg = NULL;
+		current->memcg_batch.bytes = 0;
+		current->memcg_batch.memsw_bytes = 0;
+	}
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+	struct memcg_batch_info *batch = &current->memcg_batch;
+
+	if (!batch->do_batch)
+		return;
+
+	batch->do_batch--;
+	if (batch->do_batch) /* If stacked, do nothing. */
+		return;
+
+	if (!batch->memcg)
+		return;
+	/*
+	 * This "batch->memcg" is valid without any css_get/put etc...
+	 * bacause we hide charges behind us.
+	 */
+	if (batch->bytes)
+		res_counter_uncharge(&batch->memcg->res, batch->bytes);
+	if (batch->memsw_bytes)
+		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	/* forget this pointer (for sanity check) */
+	batch->memcg = NULL;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
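
mem_cgroup_uncharge_start/end are the brackets that make the batching pay off: per the comment above, the unmap, invalidate and truncate paths call start, free a bounded run of pages, then call end, at which point the accumulated bytes go back to the res_counter in one update. A sketch of that calling pattern as a single-threaded userspace model with invented names (not the kernel API):

/* Illustrative model of the uncharge batching brackets, not kernel code. */
#include <stdio.h>

#define MODEL_PAGE 4096L

struct model_memcg { long res; };

static struct {
	int do_batch;
	struct model_memcg *memcg;
	long bytes;
} model_batch;				/* stands in for current->memcg_batch */

static void model_uncharge_start(void)
{
	if (model_batch.do_batch++ == 0) {	/* outermost start resets state */
		model_batch.memcg = NULL;
		model_batch.bytes = 0;
	}
}

/* Called once per freed page; defers the counter update while batching. */
static void model_uncharge_page(struct model_memcg *mem)
{
	if (!model_batch.do_batch ||
	    (model_batch.memcg && model_batch.memcg != mem)) {
		mem->res -= MODEL_PAGE;		/* direct uncharge */
		return;
	}
	if (!model_batch.memcg)
		model_batch.memcg = mem;
	model_batch.bytes += MODEL_PAGE;
}

static void model_uncharge_end(void)
{
	if (--model_batch.do_batch)		/* still nested: keep collecting */
		return;
	if (model_batch.memcg && model_batch.bytes)
		model_batch.memcg->res -= model_batch.bytes;	/* one update */
	model_batch.memcg = NULL;
}

int main(void)
{
	struct model_memcg mem = { .res = 256 * MODEL_PAGE };

	model_uncharge_start();			/* truncate-like caller */
	for (int i = 0; i < 128; i++)
		model_uncharge_page(&mem);
	model_uncharge_end();
	printf("usage after batched free: %ld pages\n", mem.res / MODEL_PAGE);
	return 0;
}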
@@ -2101,7 +2311,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	int progress;
 	u64 memswlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
@@ -2145,8 +2354,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
-						GFP_KERNEL,
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
 						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
@@ -2385,6 +2593,7 @@ move_account:
 		goto out;
 	/* This is for making all *used* pages to be on LRU. */
 	lru_add_drain_all();
+	drain_all_stock_sync();
 	ret = 0;
 	for_each_node_state(node, N_HIGH_MEMORY) {
 		for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2542,6 +2751,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 			val += idx_val;
 			mem_cgroup_get_recursive_idx_stat(mem,
 				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+			val += idx_val;
 			val <<= PAGE_SHIFT;
 		} else
 			val = res_counter_read_u64(&mem->memsw, name);
@@ -2661,7 +2871,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 enum {
 	MCS_CACHE,
 	MCS_RSS,
-	MCS_MAPPED_FILE,
+	MCS_FILE_MAPPED,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
@@ -2705,8 +2915,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
-	s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3098,11 +3308,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 
 	/* root ? */
 	if (cont->parent == NULL) {
+		int cpu;
 		enable_swap_cgroup();
 		parent = NULL;
 		root_mem_cgroup = mem;
 		if (mem_cgroup_soft_limit_tree_init())
 			goto free_out;
+		for_each_possible_cpu(cpu) {
+			struct memcg_stock_pcp *stock =
+						&per_cpu(memcg_stock, cpu);
+			INIT_WORK(&stock->work, drain_local_stock);
+		}
+		hotcpu_notifier(memcg_stock_cpu_callback, 0);
 
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
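
The root-cgroup branch above wires the stock machinery up: every possible cpu gets a work item bound to drain_local_stock, and the hotplug notifier added earlier drains the stock of a cpu that goes offline so its pre-charged bytes are not stranded. A small sketch of the CPU_DEAD case, as an illustrative model rather than the kernel notifier API:

/* Illustrative model of draining a dead cpu's stock, not kernel code. */
#define MODEL_NCPU 4

struct model_stock { long charge; };

static struct model_stock model_stocks[MODEL_NCPU];
static long model_global_usage;

static void model_drain(struct model_stock *s)
{
	model_global_usage -= s->charge;	/* give pre-charged bytes back */
	s->charge = 0;
}

/* What the CPU_DEAD branch of the notifier above amounts to. */
static void model_cpu_dead(int cpu)
{
	model_drain(&model_stocks[cpu]);
}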
@@ -3171,12 +3388,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	mutex_lock(&memcg_tasklist);
 	/*
 	 * FIXME: It's better to move charges of this process from old
 	 * memcg to new memcg. But it's just on TODO-List now.
 	 */
-	mutex_unlock(&memcg_tasklist);
 }
 
 struct cgroup_subsys mem_cgroup_subsys = {