Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	453
1 file changed, 334 insertions(+), 119 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c31a310aa146..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -38,6 +38,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
+#include <linux/cpu.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #define do_swap_account		(0)
 #endif
 
-static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
 #define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index {
  */
 	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_MAPPED_FILE,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
@@ -275,6 +275,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+static void drain_all_stock_async(void);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -282,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 }
 
+struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+{
+	return &mem->css;
+}
+
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct page_cgroup *pc)
 {
@@ -758,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	task_unlock(task);
 	if (!curr)
 		return 0;
-	if (curr->use_hierarchy)
+	/*
+	 * We should check use_hierarchy of "mem" not "curr". Because checking
+	 * use_hierarchy of "curr" here make this function true if hierarchy is
+	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
+	 * hierarchy(even if use_hierarchy is disabled in "mem").
+	 */
+	if (mem->use_hierarchy)
 		ret = css_is_ancestor(&curr->css, &mem->css);
 	else
 		ret = (curr == mem);
@@ -1007,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	static char memcg_name[PATH_MAX];
 	int ret;
 
-	if (!memcg)
+	if (!memcg || !p)
 		return;
 
 
@@ -1137,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
+			if (loop >= 1)
+				drain_all_stock_async();
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1223,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem)
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  */
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct mem_cgroup_stat *stat;
@@ -1231,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	int cpu;
 	struct page_cgroup *pc;
 
-	if (!page_is_file_cache(page))
-		return;
-
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
@@ -1253,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	stat = &mem->stat;
 	cpustat = &stat->cpustat[cpu];
 
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
 	unlock_page_cgroup(pc);
 }
 
 /*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define CHARGE_SIZE	(32 * PAGE_SIZE)
+struct memcg_stock_pcp {
+	struct mem_cgroup *cached; /* this never be root cgroup */
+	int charge;
+	struct work_struct work;
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static atomic_t memcg_drain_count;
+
+/*
+ * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
+ * from local stock and true is returned. If the stock is 0 or charges from a
+ * cgroup which is not current target, returns false. This stock will be
+ * refilled.
+ */
+static bool consume_stock(struct mem_cgroup *mem)
+{
+	struct memcg_stock_pcp *stock;
+	bool ret = true;
+
+	stock = &get_cpu_var(memcg_stock);
+	if (mem == stock->cached && stock->charge)
+		stock->charge -= PAGE_SIZE;
+	else /* need to call res_counter_charge */
+		ret = false;
+	put_cpu_var(memcg_stock);
+	return ret;
+}
+
+/*
+ * Returns stocks cached in percpu to res_counter and reset cached information.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+	struct mem_cgroup *old = stock->cached;
+
+	if (stock->charge) {
+		res_counter_uncharge(&old->res, stock->charge);
+		if (do_swap_account)
+			res_counter_uncharge(&old->memsw, stock->charge);
+	}
+	stock->cached = NULL;
+	stock->charge = 0;
+}
+
+/*
+ * This must be called under preempt disabled or must be called by
+ * a thread which is pinned to local cpu.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
+	drain_stock(stock);
+}
+
+/*
+ * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * This will be consumed by consumt_stock() function, later.
+ */
+static void refill_stock(struct mem_cgroup *mem, int val)
+{
+	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+
+	if (stock->cached != mem) { /* reset if necessary */
+		drain_stock(stock);
+		stock->cached = mem;
+	}
+	stock->charge += val;
+	put_cpu_var(memcg_stock);
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just put a work per cpu for draining localy on each cpu. Caller can
+ * expects some charges will be back to res_counter later but cannot wait for
+ * it.
+ */
+static void drain_all_stock_async(void)
+{
+	int cpu;
+	/* This function is for scheduling "drain" in asynchronous way.
+	 * The result of "drain" is not directly handled by callers. Then,
+	 * if someone is calling drain, we don't have to call drain more.
+	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
+	 * there is a race. We just do loose check here.
+	 */
+	if (atomic_read(&memcg_drain_count))
+		return;
+	/* Notify other cpus that system-wide "drain" is running */
+	atomic_inc(&memcg_drain_count);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+		schedule_work_on(cpu, &stock->work);
+	}
+	put_online_cpus();
+	atomic_dec(&memcg_drain_count);
+	/* We don't wait for flush_work */
+}
+
+/* This is a synchronous drain interface. */
+static void drain_all_stock_sync(void)
+{
+	/* called when force_empty is called */
+	atomic_inc(&memcg_drain_count);
+	schedule_on_each_cpu(drain_local_stock);
+	atomic_dec(&memcg_drain_count);
+}
+
+static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+					unsigned long action,
+					void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct memcg_stock_pcp *stock;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+	stock = &per_cpu(memcg_stock, cpu);
+	drain_stock(stock);
+	return NOTIFY_OK;
+}
+
+/*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
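The hunk above is the heart of the change: each CPU keeps a small "stock" of already-charged bytes so that most single-page charges never touch the shared res_counter, and the stock is drained back either asynchronously (during reclaim) or synchronously (force_empty, CPU offline). The following standalone userspace C sketch models only that idea with one counter and one stock; every name and value in it is an illustrative stand-in, not the kernel's API (no per-CPU variables, no workqueues, no locking).

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define CHARGE_SIZE	(32 * PAGE_SIZE)

struct counter { long usage; };			/* stands in for a res_counter */
struct stock   { struct counter *cached; long charge; };

static struct counter grp;			/* one cgroup's counter */
static struct stock local_stock;		/* one CPU's stock */

/* Take one page from the local stock if it is cached for "c". */
static bool consume_stock(struct counter *c)
{
	if (local_stock.cached == c && local_stock.charge >= PAGE_SIZE) {
		local_stock.charge -= PAGE_SIZE;
		return true;
	}
	return false;
}

/* Return any cached surplus to the shared counter and forget the owner. */
static void drain_stock(void)
{
	if (local_stock.charge && local_stock.cached)
		local_stock.cached->usage -= local_stock.charge;
	local_stock.cached = NULL;
	local_stock.charge = 0;
}

/* Cache "val" bytes of already-accounted charge for "c". */
static void refill_stock(struct counter *c, long val)
{
	if (local_stock.cached != c)
		drain_stock();
	local_stock.cached = c;
	local_stock.charge += val;
}

static void charge_one_page(struct counter *c)
{
	if (consume_stock(c))
		return;				/* fast path: shared counter untouched */
	c->usage += CHARGE_SIZE;		/* slow path: charge a whole batch */
	refill_stock(c, CHARGE_SIZE - PAGE_SIZE);
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		charge_one_page(&grp);
	printf("shared counter updated %ld times for 100 pages\n",
	       grp.usage / CHARGE_SIZE);		/* 4, not 100 */
	drain_stock();
	printf("usage after drain: %ld pages\n", grp.usage / PAGE_SIZE);
	return 0;
}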
@@ -1269,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct res_counter *fail_res;
+	int csize = CHARGE_SIZE;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1293,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		return 0;
 
 	VM_BUG_ON(css_is_removed(&mem->css));
+	if (mem_cgroup_is_root(mem))
+		goto done;
 
 	while (1) {
 		int ret = 0;
 		unsigned long flags = 0;
 
-		if (mem_cgroup_is_root(mem))
-			goto done;
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+		if (consume_stock(mem))
+			goto charged;
+
+		ret = res_counter_charge(&mem->res, csize, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
-			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res);
+			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			res_counter_uncharge(&mem->res, csize);
 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 							memsw);
@@ -1318,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 							res);
 
+		/* reduce request size and retry */
+		if (csize > PAGE_SIZE) {
+			csize = PAGE_SIZE;
+			continue;
+		}
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
@@ -1339,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
 		if (!nr_retries--) {
 			if (oom) {
-				mutex_lock(&memcg_tasklist);
 				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				mutex_unlock(&memcg_tasklist);
 				record_last_oom(mem_over_limit);
 			}
 			goto nomem;
 		}
 	}
+	if (csize > PAGE_SIZE)
+		refill_stock(mem, csize - PAGE_SIZE);
+charged:
 	/*
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
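Taken together, the four hunks above make __mem_cgroup_try_charge() reserve CHARGE_SIZE at once, stash the surplus in the per-cpu stock, and drop back to a single page before resorting to reclaim when the batch does not fit the limit. A hedged userspace model of just that retry logic follows; the names, the limit value, and the return convention are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define CHARGE_SIZE	(32 * PAGE_SIZE)

struct res_counter { long usage, limit; };

/* Succeeds only if usage + val stays within the limit. */
static bool counter_charge(struct res_counter *c, long val)
{
	if (c->usage + val > c->limit)
		return false;
	c->usage += val;
	return true;
}

/* Returns the number of bytes actually charged, 0 on failure. */
static long try_charge(struct res_counter *c)
{
	long csize = CHARGE_SIZE;

	for (;;) {
		if (counter_charge(c, csize))
			return csize;		/* surplus would go to the stock */
		if (csize > PAGE_SIZE) {	/* reduce request size and retry */
			csize = PAGE_SIZE;
			continue;
		}
		return 0;			/* would fall through to reclaim/OOM */
	}
}

int main(void)
{
	struct res_counter c = { .usage = 0, .limit = 40 * PAGE_SIZE };

	printf("first charge:  %ld bytes\n", try_charge(&c));	/* full batch */
	printf("second charge: %ld bytes\n", try_charge(&c));	/* single page */
	printf("usage: %ld of %ld\n", c.usage, c.limit);
	return 0;
}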
@@ -1361,6 +1508,21 @@ nomem:
 }
 
 /*
+ * Somemtimes we have to undo a charge we got by try_charge().
+ * This function is for that and do uncharge, put css's refcnt.
+ * gotten by try_charge().
+ */
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+{
+	if (!mem_cgroup_is_root(mem)) {
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	}
+	css_put(&mem->css);
+}
+
+/*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
  * it's concern. (dropping refcnt from swap can be called against removed
@@ -1379,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 	return container_of(css, struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
+struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct page_cgroup *pc;
 	unsigned short id;
 	swp_entry_t ent;
 
 	VM_BUG_ON(!PageLocked(page));
 
-	if (!PageSwapCache(page))
-		return NULL;
-
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		if (mem && !css_tryget(&mem->css))
 			mem = NULL;
-	} else {
+	} else if (PageSwapCache(page)) {
 		ent.val = page_private(page);
 		id = lookup_swap_cgroup(ent);
 		rcu_read_lock();
@@ -1426,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		if (!mem_cgroup_is_root(mem)) {
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			if (do_swap_account)
-				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-		}
-		css_put(&mem->css);
+		mem_cgroup_cancel_charge(mem);
 		return;
 	}
 
@@ -1464,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 }
 
 /**
- * mem_cgroup_move_account - move account of the page
+ * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
  *
  * The caller must confirm following.
  * - page is not on LRU (isolate_page() is useful.)
- *
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
  *
  * This function does "uncharge" from old cgroup but doesn't do "charge" to
  * new cgroup. It should be done by a caller.
  */
 
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to)
 {
-	struct mem_cgroup_per_zone *from_mz, *to_mz;
-	int nid, zid;
-	int ret = -EBUSY;
 	struct page *page;
 	int cpu;
 	struct mem_cgroup_stat *stat;
@@ -1492,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
-
-	nid = page_cgroup_nid(pc);
-	zid = page_cgroup_zid(pc);
-	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
-	to_mz = mem_cgroup_zoneinfo(to, nid, zid);
-
-	if (!trylock_page_cgroup(pc))
-		return ret;
-
-	if (!PageCgroupUsed(pc))
-		goto out;
-
-	if (pc->mem_cgroup != from)
-		goto out;
+	VM_BUG_ON(!PageCgroupLocked(pc));
+	VM_BUG_ON(!PageCgroupUsed(pc));
+	VM_BUG_ON(pc->mem_cgroup != from);
 
 	if (!mem_cgroup_is_root(from))
 		res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
-	if (page_is_file_cache(page) && page_mapped(page)) {
+	if (page_mapped(page) && !PageAnon(page)) {
 		cpu = smp_processor_id();
 		/* Update mapped_file data for mem_cgroup "from" */
 		stat = &from->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						-1);
 
 		/* Update mapped_file data for mem_cgroup "to" */
 		stat = &to->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						1);
 	}
 
@@ -1534,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	css_get(&to->css);
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, pc, true);
-	ret = 0;
-out:
-	unlock_page_cgroup(pc);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
 	 * this function is just force_empty() and it's garanteed that
 	 * "to" is never removed. So, we don't check rmdir status here.
 	 */
+}
+
+/*
+ * check whether the @pc is valid for moving account and call
+ * __mem_cgroup_move_account()
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+		struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	int ret = -EINVAL;
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+		__mem_cgroup_move_account(pc, from, to);
+		ret = 0;
+	}
+	unlock_page_cgroup(pc);
 	return ret;
 }
 
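The two hunks above split the old trylock-and-check body into __mem_cgroup_move_account(), which assumes a locked and validated page_cgroup, and a thin mem_cgroup_move_account() wrapper that takes the lock and checks validity before calling it. A minimal sketch of that wrapper pattern, using a pthread mutex and made-up field names purely for illustration (compile with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page_rec {
	pthread_mutex_t lock;
	bool used;
	int  owner;			/* stands in for pc->mem_cgroup */
};

/* Caller must hold rec->lock and have checked used/owner. */
static void __move_account(struct page_rec *rec, int from, int to)
{
	(void)from;			/* kept only to mirror the kernel signature */
	rec->owner = to;
}

/* Check whether the record is valid for moving, then call the worker. */
static int move_account(struct page_rec *rec, int from, int to)
{
	int ret = -1;			/* kernel uses -EINVAL here */

	pthread_mutex_lock(&rec->lock);
	if (rec->used && rec->owner == from) {
		__move_account(rec, from, to);
		ret = 0;
	}
	pthread_mutex_unlock(&rec->lock);
	return ret;
}

int main(void)
{
	struct page_rec rec = { PTHREAD_MUTEX_INITIALIZER, true, 1 };

	printf("move 1->2: %d, owner now %d\n", move_account(&rec, 1, 2), rec.owner);
	printf("move 1->3: %d, owner now %d\n", move_account(&rec, 1, 3), rec.owner);
	return 0;
}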
@@ -1564,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (!pcg)
 		return -EINVAL;
 
+	ret = -EBUSY;
+	if (!get_page_unless_zero(page))
+		goto out;
+	if (isolate_lru_page(page))
+		goto put;
 
 	parent = mem_cgroup_from_cont(pcg);
-
-
 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
-		return ret;
-
-	if (!get_page_unless_zero(page)) {
-		ret = -EBUSY;
-		goto uncharge;
-	}
-
-	ret = isolate_lru_page(page);
-
-	if (ret)
-		goto cancel;
+		goto put_back;
 
 	ret = mem_cgroup_move_account(pc, child, parent);
-
+	if (!ret)
+		css_put(&parent->css);	/* drop extra refcnt by try_charge() */
+	else
+		mem_cgroup_cancel_charge(parent);	/* does css_put */
+put_back:
 	putback_lru_page(page);
-	if (!ret) {
-		put_page(page);
-		/* drop extra refcnt by try_charge() */
-		css_put(&parent->css);
-		return 0;
-	}
-
-cancel:
+put:
 	put_page(page);
-uncharge:
-	/* drop extra refcnt by try_charge() */
-	css_put(&parent->css);
-	/* uncharge if move fails */
-	if (!mem_cgroup_is_root(parent)) {
-		res_counter_uncharge(&parent->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
-	}
+out:
 	return ret;
 }
 
@@ -1737,12 +1870,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	/*
 	 * A racing thread's fault, or swapoff, may have already updated
-	 * the pte, and even removed page from swap cache: return success
-	 * to go on to do_swap_page()'s pte_same() test, which should fail.
+	 * the pte, and even removed page from swap cache: in those cases
+	 * do_swap_page()'s pte_same() test will fail; but there's also a
+	 * KSM case which does need to charge the page.
 	 */
 	if (!PageSwapCache(page))
-		return 0;
-	mem = try_get_mem_cgroup_from_swapcache(page);
+		goto charge_cur_mm;
+	mem = try_get_mem_cgroup_from_page(page);
 	if (!mem)
 		goto charge_cur_mm;
 	*ptr = mem;
@@ -1818,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
-	css_put(&mem->css);
+	mem_cgroup_cancel_charge(mem);
 }
 
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+	struct memcg_batch_info *batch = NULL;
+	bool uncharge_memsw = true;
+	/* If swapout, usage of swap doesn't decrease */
+	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		uncharge_memsw = false;
+	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continously can be expected to be in
+	 * the same cgroup and we have chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	batch = &current->memcg_batch;
+	/*
+	 * In usual, we do css_get() when we remember memcg pointer.
+	 * But in this case, we keep res->usage until end of a series of
+	 * uncharges. Then, it's ok to ignore memcg's refcnt.
+	 */
+	if (!batch->memcg)
+		batch->memcg = mem;
+	/*
+	 * In typical case, batch->memcg == mem. This means we can
+	 * merge a series of uncharges to an uncharge of res_counter.
+	 * If not, we uncharge res_counter ony by one.
+	 */
+	if (batch->memcg != mem)
+		goto direct_uncharge;
+	/* remember freed charge and uncharge it later */
+	batch->bytes += PAGE_SIZE;
+	if (uncharge_memsw)
+		batch->memsw_bytes += PAGE_SIZE;
+	return;
+direct_uncharge:
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (uncharge_memsw)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	return;
+}
 
 /*
  * uncharge if !page_mapped(page)
@@ -1874,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account &&
-			(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
@@ -1925,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+/*
+ * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
+ * In that cases, pages are freed continuously and we can expect pages
+ * are in the same memcg. All these calls itself limits the number of
+ * pages freed at once, then uncharge_start/end() is called properly.
+ * This may be called prural(2) times in a context,
+ */
+
+void mem_cgroup_uncharge_start(void)
+{
+	current->memcg_batch.do_batch++;
+	/* We can do nest. */
+	if (current->memcg_batch.do_batch == 1) {
+		current->memcg_batch.memcg = NULL;
+		current->memcg_batch.bytes = 0;
+		current->memcg_batch.memsw_bytes = 0;
+	}
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+	struct memcg_batch_info *batch = &current->memcg_batch;
+
+	if (!batch->do_batch)
+		return;
+
+	batch->do_batch--;
+	if (batch->do_batch) /* If stacked, do nothing. */
+		return;
+
+	if (!batch->memcg)
+		return;
+	/*
+	 * This "batch->memcg" is valid without any css_get/put etc...
+	 * bacause we hide charges behind us.
+	 */
+	if (batch->bytes)
+		res_counter_uncharge(&batch->memcg->res, batch->bytes);
+	if (batch->memsw_bytes)
+		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	/* forget this pointer (for sanity check) */
+	batch->memcg = NULL;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
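__do_uncharge() and the mem_cgroup_uncharge_start()/end() pair added above coalesce a run of per-page uncharges into a single res_counter update per batch, as long as the pages belong to the same memcg. The following self-contained userspace sketch models that batching with a plain global standing in for current->memcg_batch; the names and numbers are illustrative only.

#include <stdio.h>

#define PAGE_SIZE	4096

struct res_counter { long usage; };

struct batch_info {
	int  do_batch;			/* nesting depth */
	struct res_counter *memcg;	/* counter the batch belongs to */
	long bytes;			/* charge accumulated so far */
};

static struct batch_info batch;		/* models current->memcg_batch */

static void uncharge_start(void)
{
	if (batch.do_batch++ == 0) {	/* outermost caller resets the batch */
		batch.memcg = NULL;
		batch.bytes = 0;
	}
}

static void uncharge_page(struct res_counter *c)
{
	if (!batch.do_batch || (batch.memcg && batch.memcg != c)) {
		c->usage -= PAGE_SIZE;	/* direct uncharge, no coalescing */
		return;
	}
	if (!batch.memcg)
		batch.memcg = c;
	batch.bytes += PAGE_SIZE;	/* remember now, flush later */
}

static void uncharge_end(void)
{
	if (--batch.do_batch)		/* nested: let the outer call flush */
		return;
	if (batch.memcg && batch.bytes)
		batch.memcg->usage -= batch.bytes;
	batch.memcg = NULL;
}

int main(void)
{
	struct res_counter c = { .usage = 64 * PAGE_SIZE };

	uncharge_start();
	for (int i = 0; i < 10; i++)
		uncharge_page(&c);	/* one counter update instead of ten */
	uncharge_end();

	printf("usage: %ld pages\n", c.usage / PAGE_SIZE);	/* 54 */
	return 0;
}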
@@ -2100,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 						unsigned long long val)
 {
 	int retry_count;
-	int progress;
 	u64 memswlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
@@ -2144,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
-						GFP_KERNEL,
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
 						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
@@ -2375,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
 	if (free_all)
 		goto try_to_free;
 move_account:
-	while (mem->res.usage > 0) {
+	do {
 		ret = -EBUSY;
 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
 			goto out;
@@ -2384,6 +2595,7 @@ move_account:
 			goto out;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
+		drain_all_stock_sync();
 		ret = 0;
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2402,8 +2614,8 @@ move_account:
 		if (ret == -ENOMEM)
 			goto try_to_free;
 		cond_resched();
-	}
-	ret = 0;
+	/* "ret" should also be checked to ensure all lists are empty. */
+	} while (mem->res.usage > 0 || ret);
 out:
 	css_put(&mem->css);
 	return ret;
@@ -2436,10 +2648,7 @@ try_to_free:
 	}
 	lru_add_drain();
 	/* try move_account...there may be some *locked* pages. */
-	if (mem->res.usage)
-		goto move_account;
-	ret = 0;
-	goto out;
+	goto move_account;
 }
 
 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
@@ -2541,6 +2750,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 			val += idx_val;
 		mem_cgroup_get_recursive_idx_stat(mem,
 				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+		val += idx_val;
 		val <<= PAGE_SHIFT;
 	} else
 		val = res_counter_read_u64(&mem->memsw, name);
@@ -2660,7 +2870,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 enum {
 	MCS_CACHE,
 	MCS_RSS,
-	MCS_MAPPED_FILE,
+	MCS_FILE_MAPPED,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
@@ -2704,8 +2914,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
-	s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3097,11 +3307,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 
 	/* root ? */
 	if (cont->parent == NULL) {
+		int cpu;
 		enable_swap_cgroup();
 		parent = NULL;
 		root_mem_cgroup = mem;
 		if (mem_cgroup_soft_limit_tree_init())
 			goto free_out;
+		for_each_possible_cpu(cpu) {
+			struct memcg_stock_pcp *stock =
+						&per_cpu(memcg_stock, cpu);
+			INIT_WORK(&stock->work, drain_local_stock);
+		}
+		hotcpu_notifier(memcg_stock_cpu_callback, 0);
 
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
@@ -3170,12 +3387,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	mutex_lock(&memcg_tasklist);
 	/*
 	 * FIXME: It's better to move charges of this process from old
 	 * memcg to new memcg. But it's just on TODO-List now.
 	 */
-	mutex_unlock(&memcg_tasklist);
 }
 
 struct cgroup_subsys mem_cgroup_subsys = {