diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 419 |
1 files changed, 317 insertions, 102 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e0c2066495e3..878808c4fcbe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/mm_inline.h> | 39 | #include <linux/mm_inline.h> |
40 | #include <linux/page_cgroup.h> | 40 | #include <linux/page_cgroup.h> |
41 | #include <linux/cpu.h> | ||
41 | #include "internal.h" | 42 | #include "internal.h" |
42 | 43 | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
54 | #define do_swap_account (0) | 55 | #define do_swap_account (0) |
55 | #endif | 56 | #endif |
56 | 57 | ||
57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | ||
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) |
59 | 59 | ||
60 | /* | 60 | /* |
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index { | |||
66 | */ | 66 | */ |
67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ |
@@ -275,6 +275,7 @@ enum charge_type { | |||
275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
278 | static void drain_all_stock_async(void); | ||
278 | 279 | ||
279 | static struct mem_cgroup_per_zone * | 280 | static struct mem_cgroup_per_zone * |
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 281 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -758,7 +759,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
758 | task_unlock(task); | 759 | task_unlock(task); |
759 | if (!curr) | 760 | if (!curr) |
760 | return 0; | 761 | return 0; |
761 | if (curr->use_hierarchy) | 762 | /* |
763 | * We should check use_hierarchy of "mem" not "curr". Because checking | ||
764 | * use_hierarchy of "curr" here make this function true if hierarchy is | ||
765 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | ||
766 | * hierarchy(even if use_hierarchy is disabled in "mem"). | ||
767 | */ | ||
768 | if (mem->use_hierarchy) | ||
762 | ret = css_is_ancestor(&curr->css, &mem->css); | 769 | ret = css_is_ancestor(&curr->css, &mem->css); |
763 | else | 770 | else |
764 | ret = (curr == mem); | 771 | ret = (curr == mem); |
@@ -1007,7 +1014,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1007 | static char memcg_name[PATH_MAX]; | 1014 | static char memcg_name[PATH_MAX]; |
1008 | int ret; | 1015 | int ret; |
1009 | 1016 | ||
1010 | if (!memcg) | 1017 | if (!memcg || !p) |
1011 | return; | 1018 | return; |
1012 | 1019 | ||
1013 | 1020 | ||
@@ -1137,6 +1144,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1137 | victim = mem_cgroup_select_victim(root_mem); | 1144 | victim = mem_cgroup_select_victim(root_mem); |
1138 | if (victim == root_mem) { | 1145 | if (victim == root_mem) { |
1139 | loop++; | 1146 | loop++; |
1147 | if (loop >= 1) | ||
1148 | drain_all_stock_async(); | ||
1140 | if (loop >= 2) { | 1149 | if (loop >= 2) { |
1141 | /* | 1150 | /* |
1142 | * If we have not been able to reclaim | 1151 | * If we have not been able to reclaim |
@@ -1223,7 +1232,7 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
1223 | * Currently used to update mapped file statistics, but the routine can be | 1232 | * Currently used to update mapped file statistics, but the routine can be |
1224 | * generalized to update other statistics as well. | 1233 | * generalized to update other statistics as well. |
1225 | */ | 1234 | */ |
1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | 1235 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1227 | { | 1236 | { |
1228 | struct mem_cgroup *mem; | 1237 | struct mem_cgroup *mem; |
1229 | struct mem_cgroup_stat *stat; | 1238 | struct mem_cgroup_stat *stat; |
@@ -1231,9 +1240,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
1231 | int cpu; | 1240 | int cpu; |
1232 | struct page_cgroup *pc; | 1241 | struct page_cgroup *pc; |
1233 | 1242 | ||
1234 | if (!page_is_file_cache(page)) | ||
1235 | return; | ||
1236 | |||
1237 | pc = lookup_page_cgroup(page); | 1243 | pc = lookup_page_cgroup(page); |
1238 | if (unlikely(!pc)) | 1244 | if (unlikely(!pc)) |
1239 | return; | 1245 | return; |
@@ -1253,12 +1259,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
1253 | stat = &mem->stat; | 1259 | stat = &mem->stat; |
1254 | cpustat = &stat->cpustat[cpu]; | 1260 | cpustat = &stat->cpustat[cpu]; |
1255 | 1261 | ||
1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | 1262 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); |
1257 | done: | 1263 | done: |
1258 | unlock_page_cgroup(pc); | 1264 | unlock_page_cgroup(pc); |
1259 | } | 1265 | } |
1260 | 1266 | ||
1261 | /* | 1267 | /* |
1268 | * size of first charge trial. "32" comes from vmscan.c's magic value. | ||
1269 | * TODO: maybe necessary to use big numbers in big irons. | ||
1270 | */ | ||
1271 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
1272 | struct memcg_stock_pcp { | ||
1273 | struct mem_cgroup *cached; /* this never be root cgroup */ | ||
1274 | int charge; | ||
1275 | struct work_struct work; | ||
1276 | }; | ||
1277 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
1278 | static atomic_t memcg_drain_count; | ||
1279 | |||
1280 | /* | ||
1281 | * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed | ||
1282 | * from local stock and true is returned. If the stock is 0 or charges from a | ||
1283 | * cgroup which is not current target, returns false. This stock will be | ||
1284 | * refilled. | ||
1285 | */ | ||
1286 | static bool consume_stock(struct mem_cgroup *mem) | ||
1287 | { | ||
1288 | struct memcg_stock_pcp *stock; | ||
1289 | bool ret = true; | ||
1290 | |||
1291 | stock = &get_cpu_var(memcg_stock); | ||
1292 | if (mem == stock->cached && stock->charge) | ||
1293 | stock->charge -= PAGE_SIZE; | ||
1294 | else /* need to call res_counter_charge */ | ||
1295 | ret = false; | ||
1296 | put_cpu_var(memcg_stock); | ||
1297 | return ret; | ||
1298 | } | ||
1299 | |||
1300 | /* | ||
1301 | * Returns stocks cached in percpu to res_counter and reset cached information. | ||
1302 | */ | ||
1303 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
1304 | { | ||
1305 | struct mem_cgroup *old = stock->cached; | ||
1306 | |||
1307 | if (stock->charge) { | ||
1308 | res_counter_uncharge(&old->res, stock->charge); | ||
1309 | if (do_swap_account) | ||
1310 | res_counter_uncharge(&old->memsw, stock->charge); | ||
1311 | } | ||
1312 | stock->cached = NULL; | ||
1313 | stock->charge = 0; | ||
1314 | } | ||
1315 | |||
1316 | /* | ||
1317 | * This must be called under preempt disabled or must be called by | ||
1318 | * a thread which is pinned to local cpu. | ||
1319 | */ | ||
1320 | static void drain_local_stock(struct work_struct *dummy) | ||
1321 | { | ||
1322 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
1323 | drain_stock(stock); | ||
1324 | } | ||
1325 | |||
1326 | /* | ||
1327 | * Cache charges(val) which is from res_counter, to local per_cpu area. | ||
1328 | * This will be consumed by consume_stock() function, later. | ||
1329 | */ | ||
1330 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
1331 | { | ||
1332 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
1333 | |||
1334 | if (stock->cached != mem) { /* reset if necessary */ | ||
1335 | drain_stock(stock); | ||
1336 | stock->cached = mem; | ||
1337 | } | ||
1338 | stock->charge += val; | ||
1339 | put_cpu_var(memcg_stock); | ||
1340 | } | ||
1341 | |||
1342 | /* | ||
1343 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
1344 | * and just puts a work per cpu for draining locally on each cpu. Caller can | ||
1345 | * expect some charges will be back to res_counter later but cannot wait for | ||
1346 | * it. | ||
1347 | */ | ||
1348 | static void drain_all_stock_async(void) | ||
1349 | { | ||
1350 | int cpu; | ||
1351 | /* This function is for scheduling "drain" in asynchronous way. | ||
1352 | * The result of "drain" is not directly handled by callers. Then, | ||
1353 | * if someone is calling drain, we don't have to call drain more. | ||
1354 | * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if | ||
1355 | * there is a race. We just do loose check here. | ||
1356 | */ | ||
1357 | if (atomic_read(&memcg_drain_count)) | ||
1358 | return; | ||
1359 | /* Notify other cpus that system-wide "drain" is running */ | ||
1360 | atomic_inc(&memcg_drain_count); | ||
1361 | get_online_cpus(); | ||
1362 | for_each_online_cpu(cpu) { | ||
1363 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
1364 | schedule_work_on(cpu, &stock->work); | ||
1365 | } | ||
1366 | put_online_cpus(); | ||
1367 | atomic_dec(&memcg_drain_count); | ||
1368 | /* We don't wait for flush_work */ | ||
1369 | } | ||
1370 | |||
1371 | /* This is a synchronous drain interface. */ | ||
1372 | static void drain_all_stock_sync(void) | ||
1373 | { | ||
1374 | /* called when force_empty is called */ | ||
1375 | atomic_inc(&memcg_drain_count); | ||
1376 | schedule_on_each_cpu(drain_local_stock); | ||
1377 | atomic_dec(&memcg_drain_count); | ||
1378 | } | ||
1379 | |||
1380 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
1381 | unsigned long action, | ||
1382 | void *hcpu) | ||
1383 | { | ||
1384 | int cpu = (unsigned long)hcpu; | ||
1385 | struct memcg_stock_pcp *stock; | ||
1386 | |||
1387 | if (action != CPU_DEAD) | ||
1388 | return NOTIFY_OK; | ||
1389 | stock = &per_cpu(memcg_stock, cpu); | ||
1390 | drain_stock(stock); | ||
1391 | return NOTIFY_OK; | ||
1392 | } | ||
1393 | |||
1394 | /* | ||
1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1395 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1263 | * oom-killer can be invoked. | 1396 | * oom-killer can be invoked. |
1264 | */ | 1397 | */ |
@@ -1269,6 +1402,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1269 | struct mem_cgroup *mem, *mem_over_limit; | 1402 | struct mem_cgroup *mem, *mem_over_limit; |
1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1403 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1271 | struct res_counter *fail_res; | 1404 | struct res_counter *fail_res; |
1405 | int csize = CHARGE_SIZE; | ||
1272 | 1406 | ||
1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1407 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
1274 | /* Don't account this! */ | 1408 | /* Don't account this! */ |
@@ -1293,23 +1427,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1293 | return 0; | 1427 | return 0; |
1294 | 1428 | ||
1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1429 | VM_BUG_ON(css_is_removed(&mem->css)); |
1430 | if (mem_cgroup_is_root(mem)) | ||
1431 | goto done; | ||
1296 | 1432 | ||
1297 | while (1) { | 1433 | while (1) { |
1298 | int ret = 0; | 1434 | int ret = 0; |
1299 | unsigned long flags = 0; | 1435 | unsigned long flags = 0; |
1300 | 1436 | ||
1301 | if (mem_cgroup_is_root(mem)) | 1437 | if (consume_stock(mem)) |
1302 | goto done; | 1438 | goto charged; |
1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1439 | |
1440 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1304 | if (likely(!ret)) { | 1441 | if (likely(!ret)) { |
1305 | if (!do_swap_account) | 1442 | if (!do_swap_account) |
1306 | break; | 1443 | break; |
1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1444 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1308 | &fail_res); | ||
1309 | if (likely(!ret)) | 1445 | if (likely(!ret)) |
1310 | break; | 1446 | break; |
1311 | /* mem+swap counter fails */ | 1447 | /* mem+swap counter fails */ |
1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1448 | res_counter_uncharge(&mem->res, csize); |
1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1449 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1450 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1315 | memsw); | 1451 | memsw); |
@@ -1318,6 +1454,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1454 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1319 | res); | 1455 | res); |
1320 | 1456 | ||
1457 | /* reduce request size and retry */ | ||
1458 | if (csize > PAGE_SIZE) { | ||
1459 | csize = PAGE_SIZE; | ||
1460 | continue; | ||
1461 | } | ||
1321 | if (!(gfp_mask & __GFP_WAIT)) | 1462 | if (!(gfp_mask & __GFP_WAIT)) |
1322 | goto nomem; | 1463 | goto nomem; |
1323 | 1464 | ||
@@ -1339,14 +1480,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1339 | 1480 | ||
1340 | if (!nr_retries--) { | 1481 | if (!nr_retries--) { |
1341 | if (oom) { | 1482 | if (oom) { |
1342 | mutex_lock(&memcg_tasklist); | ||
1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1483 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); |
1344 | mutex_unlock(&memcg_tasklist); | ||
1345 | record_last_oom(mem_over_limit); | 1484 | record_last_oom(mem_over_limit); |
1346 | } | 1485 | } |
1347 | goto nomem; | 1486 | goto nomem; |
1348 | } | 1487 | } |
1349 | } | 1488 | } |
1489 | if (csize > PAGE_SIZE) | ||
1490 | refill_stock(mem, csize - PAGE_SIZE); | ||
1491 | charged: | ||
1350 | /* | 1492 | /* |
1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1493 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
1352 | * if they exceeds softlimit. | 1494 | * if they exceeds softlimit. |
@@ -1361,6 +1503,21 @@ nomem: | |||
1361 | } | 1503 | } |
1362 | 1504 | ||
1363 | /* | 1505 | /* |
1506 | * Sometimes we have to undo a charge we got by try_charge(). | ||
1507 | * This function is for that and do uncharge, put css's refcnt. | ||
1508 | * gotten by try_charge(). | ||
1509 | */ | ||
1510 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1511 | { | ||
1512 | if (!mem_cgroup_is_root(mem)) { | ||
1513 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1514 | if (do_swap_account) | ||
1515 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1516 | } | ||
1517 | css_put(&mem->css); | ||
1518 | } | ||
1519 | |||
1520 | /* | ||
1364 | * A helper function to get mem_cgroup from ID. must be called under | 1521 | * A helper function to get mem_cgroup from ID. must be called under |
1365 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 1522 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
1366 | * it's concern. (dropping refcnt from swap can be called against removed | 1523 | * it's concern. (dropping refcnt from swap can be called against removed |
@@ -1426,12 +1583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1426 | lock_page_cgroup(pc); | 1583 | lock_page_cgroup(pc); |
1427 | if (unlikely(PageCgroupUsed(pc))) { | 1584 | if (unlikely(PageCgroupUsed(pc))) { |
1428 | unlock_page_cgroup(pc); | 1585 | unlock_page_cgroup(pc); |
1429 | if (!mem_cgroup_is_root(mem)) { | 1586 | mem_cgroup_cancel_charge(mem); |
1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1431 | if (do_swap_account) | ||
1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1433 | } | ||
1434 | css_put(&mem->css); | ||
1435 | return; | 1587 | return; |
1436 | } | 1588 | } |
1437 | 1589 | ||
@@ -1464,27 +1616,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1464 | } | 1616 | } |
1465 | 1617 | ||
1466 | /** | 1618 | /** |
1467 | * mem_cgroup_move_account - move account of the page | 1619 | * __mem_cgroup_move_account - move account of the page |
1468 | * @pc: page_cgroup of the page. | 1620 | * @pc: page_cgroup of the page. |
1469 | * @from: mem_cgroup which the page is moved from. | 1621 | * @from: mem_cgroup which the page is moved from. |
1470 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1622 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1471 | * | 1623 | * |
1472 | * The caller must confirm following. | 1624 | * The caller must confirm following. |
1473 | * - page is not on LRU (isolate_page() is useful.) | 1625 | * - page is not on LRU (isolate_page() is useful.) |
1474 | * | 1626 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
1475 | * returns 0 at success, | ||
1476 | * returns -EBUSY when lock is busy or "pc" is unstable. | ||
1477 | * | 1627 | * |
1478 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1628 | * This function does "uncharge" from old cgroup but doesn't do "charge" to |
1479 | * new cgroup. It should be done by a caller. | 1629 | * new cgroup. It should be done by a caller. |
1480 | */ | 1630 | */ |
1481 | 1631 | ||
1482 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1632 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1483 | struct mem_cgroup *from, struct mem_cgroup *to) | 1633 | struct mem_cgroup *from, struct mem_cgroup *to) |
1484 | { | 1634 | { |
1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
1486 | int nid, zid; | ||
1487 | int ret = -EBUSY; | ||
1488 | struct page *page; | 1635 | struct page *page; |
1489 | int cpu; | 1636 | int cpu; |
1490 | struct mem_cgroup_stat *stat; | 1637 | struct mem_cgroup_stat *stat; |
@@ -1492,38 +1639,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1492 | 1639 | ||
1493 | VM_BUG_ON(from == to); | 1640 | VM_BUG_ON(from == to); |
1494 | VM_BUG_ON(PageLRU(pc->page)); | 1641 | VM_BUG_ON(PageLRU(pc->page)); |
1495 | 1642 | VM_BUG_ON(!PageCgroupLocked(pc)); | |
1496 | nid = page_cgroup_nid(pc); | 1643 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1497 | zid = page_cgroup_zid(pc); | 1644 | VM_BUG_ON(pc->mem_cgroup != from); |
1498 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | ||
1499 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | ||
1500 | |||
1501 | if (!trylock_page_cgroup(pc)) | ||
1502 | return ret; | ||
1503 | |||
1504 | if (!PageCgroupUsed(pc)) | ||
1505 | goto out; | ||
1506 | |||
1507 | if (pc->mem_cgroup != from) | ||
1508 | goto out; | ||
1509 | 1645 | ||
1510 | if (!mem_cgroup_is_root(from)) | 1646 | if (!mem_cgroup_is_root(from)) |
1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1647 | res_counter_uncharge(&from->res, PAGE_SIZE); |
1512 | mem_cgroup_charge_statistics(from, pc, false); | 1648 | mem_cgroup_charge_statistics(from, pc, false); |
1513 | 1649 | ||
1514 | page = pc->page; | 1650 | page = pc->page; |
1515 | if (page_is_file_cache(page) && page_mapped(page)) { | 1651 | if (page_mapped(page) && !PageAnon(page)) { |
1516 | cpu = smp_processor_id(); | 1652 | cpu = smp_processor_id(); |
1517 | /* Update mapped_file data for mem_cgroup "from" */ | 1653 | /* Update mapped_file data for mem_cgroup "from" */ |
1518 | stat = &from->stat; | 1654 | stat = &from->stat; |
1519 | cpustat = &stat->cpustat[cpu]; | 1655 | cpustat = &stat->cpustat[cpu]; |
1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1656 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
1521 | -1); | 1657 | -1); |
1522 | 1658 | ||
1523 | /* Update mapped_file data for mem_cgroup "to" */ | 1659 | /* Update mapped_file data for mem_cgroup "to" */ |
1524 | stat = &to->stat; | 1660 | stat = &to->stat; |
1525 | cpustat = &stat->cpustat[cpu]; | 1661 | cpustat = &stat->cpustat[cpu]; |
1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1662 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
1527 | 1); | 1663 | 1); |
1528 | } | 1664 | } |
1529 | 1665 | ||
@@ -1534,15 +1670,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1534 | css_get(&to->css); | 1670 | css_get(&to->css); |
1535 | pc->mem_cgroup = to; | 1671 | pc->mem_cgroup = to; |
1536 | mem_cgroup_charge_statistics(to, pc, true); | 1672 | mem_cgroup_charge_statistics(to, pc, true); |
1537 | ret = 0; | ||
1538 | out: | ||
1539 | unlock_page_cgroup(pc); | ||
1540 | /* | 1673 | /* |
1541 | * We charges against "to" which may not have any tasks. Then, "to" | 1674 | * We charges against "to" which may not have any tasks. Then, "to" |
1542 | * can be under rmdir(). But in current implementation, caller of | 1675 | * can be under rmdir(). But in current implementation, caller of |
1543 | * this function is just force_empty() and it's garanteed that | 1676 | * this function is just force_empty() and it's garanteed that |
1544 | * "to" is never removed. So, we don't check rmdir status here. | 1677 | * "to" is never removed. So, we don't check rmdir status here. |
1545 | */ | 1678 | */ |
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * check whether the @pc is valid for moving account and call | ||
1683 | * __mem_cgroup_move_account() | ||
1684 | */ | ||
1685 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
1686 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
1687 | { | ||
1688 | int ret = -EINVAL; | ||
1689 | lock_page_cgroup(pc); | ||
1690 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
1691 | __mem_cgroup_move_account(pc, from, to); | ||
1692 | ret = 0; | ||
1693 | } | ||
1694 | unlock_page_cgroup(pc); | ||
1546 | return ret; | 1695 | return ret; |
1547 | } | 1696 | } |
1548 | 1697 | ||
@@ -1564,45 +1713,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1564 | if (!pcg) | 1713 | if (!pcg) |
1565 | return -EINVAL; | 1714 | return -EINVAL; |
1566 | 1715 | ||
1716 | ret = -EBUSY; | ||
1717 | if (!get_page_unless_zero(page)) | ||
1718 | goto out; | ||
1719 | if (isolate_lru_page(page)) | ||
1720 | goto put; | ||
1567 | 1721 | ||
1568 | parent = mem_cgroup_from_cont(pcg); | 1722 | parent = mem_cgroup_from_cont(pcg); |
1569 | |||
1570 | |||
1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1723 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
1572 | if (ret || !parent) | 1724 | if (ret || !parent) |
1573 | return ret; | 1725 | goto put_back; |
1574 | |||
1575 | if (!get_page_unless_zero(page)) { | ||
1576 | ret = -EBUSY; | ||
1577 | goto uncharge; | ||
1578 | } | ||
1579 | |||
1580 | ret = isolate_lru_page(page); | ||
1581 | |||
1582 | if (ret) | ||
1583 | goto cancel; | ||
1584 | 1726 | ||
1585 | ret = mem_cgroup_move_account(pc, child, parent); | 1727 | ret = mem_cgroup_move_account(pc, child, parent); |
1586 | 1728 | if (!ret) | |
1729 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | ||
1730 | else | ||
1731 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
1732 | put_back: | ||
1587 | putback_lru_page(page); | 1733 | putback_lru_page(page); |
1588 | if (!ret) { | 1734 | put: |
1589 | put_page(page); | ||
1590 | /* drop extra refcnt by try_charge() */ | ||
1591 | css_put(&parent->css); | ||
1592 | return 0; | ||
1593 | } | ||
1594 | |||
1595 | cancel: | ||
1596 | put_page(page); | 1735 | put_page(page); |
1597 | uncharge: | 1736 | out: |
1598 | /* drop extra refcnt by try_charge() */ | ||
1599 | css_put(&parent->css); | ||
1600 | /* uncharge if move fails */ | ||
1601 | if (!mem_cgroup_is_root(parent)) { | ||
1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1603 | if (do_swap_account) | ||
1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1605 | } | ||
1606 | return ret; | 1737 | return ret; |
1607 | } | 1738 | } |
1608 | 1739 | ||
@@ -1819,14 +1950,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1819 | return; | 1950 | return; |
1820 | if (!mem) | 1951 | if (!mem) |
1821 | return; | 1952 | return; |
1822 | if (!mem_cgroup_is_root(mem)) { | 1953 | mem_cgroup_cancel_charge(mem); |
1823 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1824 | if (do_swap_account) | ||
1825 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1826 | } | ||
1827 | css_put(&mem->css); | ||
1828 | } | 1954 | } |
1829 | 1955 | ||
1956 | static void | ||
1957 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
1958 | { | ||
1959 | struct memcg_batch_info *batch = NULL; | ||
1960 | bool uncharge_memsw = true; | ||
1961 | /* If swapout, usage of swap doesn't decrease */ | ||
1962 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1963 | uncharge_memsw = false; | ||
1964 | /* | ||
1965 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
1966 | * In those cases, all pages freed continuously can be expected to be in | ||
1967 | * the same cgroup and we have chance to coalesce uncharges. | ||
1968 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
1969 | * because we want to do uncharge as soon as possible. | ||
1970 | */ | ||
1971 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
1972 | goto direct_uncharge; | ||
1973 | |||
1974 | batch = &current->memcg_batch; | ||
1975 | /* | ||
1976 | * In usual, we do css_get() when we remember memcg pointer. | ||
1977 | * But in this case, we keep res->usage until end of a series of | ||
1978 | * uncharges. Then, it's ok to ignore memcg's refcnt. | ||
1979 | */ | ||
1980 | if (!batch->memcg) | ||
1981 | batch->memcg = mem; | ||
1982 | /* | ||
1983 | * In typical case, batch->memcg == mem. This means we can | ||
1984 | * merge a series of uncharges to an uncharge of res_counter. | ||
1985 | * If not, we uncharge res_counter one by one. | ||
1986 | */ | ||
1987 | if (batch->memcg != mem) | ||
1988 | goto direct_uncharge; | ||
1989 | /* remember freed charge and uncharge it later */ | ||
1990 | batch->bytes += PAGE_SIZE; | ||
1991 | if (uncharge_memsw) | ||
1992 | batch->memsw_bytes += PAGE_SIZE; | ||
1993 | return; | ||
1994 | direct_uncharge: | ||
1995 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1996 | if (uncharge_memsw) | ||
1997 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1998 | return; | ||
1999 | } | ||
1830 | 2000 | ||
1831 | /* | 2001 | /* |
1832 | * uncharge if !page_mapped(page) | 2002 | * uncharge if !page_mapped(page) |
@@ -1875,12 +2045,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1875 | break; | 2045 | break; |
1876 | } | 2046 | } |
1877 | 2047 | ||
1878 | if (!mem_cgroup_is_root(mem)) { | 2048 | if (!mem_cgroup_is_root(mem)) |
1879 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2049 | __do_uncharge(mem, ctype); |
1880 | if (do_swap_account && | ||
1881 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1882 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1883 | } | ||
1884 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2050 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1885 | mem_cgroup_swap_statistics(mem, true); | 2051 | mem_cgroup_swap_statistics(mem, true); |
1886 | mem_cgroup_charge_statistics(mem, pc, false); | 2052 | mem_cgroup_charge_statistics(mem, pc, false); |
@@ -1926,6 +2092,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1926 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2092 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1927 | } | 2093 | } |
1928 | 2094 | ||
2095 | /* | ||
2096 | * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. | ||
2097 | * In those cases, pages are freed continuously and we can expect pages | ||
2098 | * are in the same memcg. All these calls themselves limit the number of | ||
2099 | * pages freed at once, then uncharge_start/end() is called properly. | ||
2100 | * This may be called plural (2) times in a context. | ||
2101 | */ | ||
2102 | |||
2103 | void mem_cgroup_uncharge_start(void) | ||
2104 | { | ||
2105 | current->memcg_batch.do_batch++; | ||
2106 | /* We can do nest. */ | ||
2107 | if (current->memcg_batch.do_batch == 1) { | ||
2108 | current->memcg_batch.memcg = NULL; | ||
2109 | current->memcg_batch.bytes = 0; | ||
2110 | current->memcg_batch.memsw_bytes = 0; | ||
2111 | } | ||
2112 | } | ||
2113 | |||
2114 | void mem_cgroup_uncharge_end(void) | ||
2115 | { | ||
2116 | struct memcg_batch_info *batch = &current->memcg_batch; | ||
2117 | |||
2118 | if (!batch->do_batch) | ||
2119 | return; | ||
2120 | |||
2121 | batch->do_batch--; | ||
2122 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
2123 | return; | ||
2124 | |||
2125 | if (!batch->memcg) | ||
2126 | return; | ||
2127 | /* | ||
2128 | * This "batch->memcg" is valid without any css_get/put etc... | ||
2129 | * because we hide charges behind us. | ||
2130 | */ | ||
2131 | if (batch->bytes) | ||
2132 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
2133 | if (batch->memsw_bytes) | ||
2134 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
2135 | /* forget this pointer (for sanity check) */ | ||
2136 | batch->memcg = NULL; | ||
2137 | } | ||
2138 | |||
1929 | #ifdef CONFIG_SWAP | 2139 | #ifdef CONFIG_SWAP |
1930 | /* | 2140 | /* |
1931 | * called after __delete_from_swap_cache() and drop "page" account. | 2141 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -2101,7 +2311,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2101 | unsigned long long val) | 2311 | unsigned long long val) |
2102 | { | 2312 | { |
2103 | int retry_count; | 2313 | int retry_count; |
2104 | int progress; | ||
2105 | u64 memswlimit; | 2314 | u64 memswlimit; |
2106 | int ret = 0; | 2315 | int ret = 0; |
2107 | int children = mem_cgroup_count_children(memcg); | 2316 | int children = mem_cgroup_count_children(memcg); |
@@ -2145,8 +2354,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2145 | if (!ret) | 2354 | if (!ret) |
2146 | break; | 2355 | break; |
2147 | 2356 | ||
2148 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, | 2357 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2149 | GFP_KERNEL, | ||
2150 | MEM_CGROUP_RECLAIM_SHRINK); | 2358 | MEM_CGROUP_RECLAIM_SHRINK); |
2151 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2359 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2152 | /* Usage is reduced ? */ | 2360 | /* Usage is reduced ? */ |
@@ -2385,6 +2593,7 @@ move_account: | |||
2385 | goto out; | 2593 | goto out; |
2386 | /* This is for making all *used* pages to be on LRU. */ | 2594 | /* This is for making all *used* pages to be on LRU. */ |
2387 | lru_add_drain_all(); | 2595 | lru_add_drain_all(); |
2596 | drain_all_stock_sync(); | ||
2388 | ret = 0; | 2597 | ret = 0; |
2389 | for_each_node_state(node, N_HIGH_MEMORY) { | 2598 | for_each_node_state(node, N_HIGH_MEMORY) { |
2390 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2599 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
@@ -2542,6 +2751,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
2542 | val += idx_val; | 2751 | val += idx_val; |
2543 | mem_cgroup_get_recursive_idx_stat(mem, | 2752 | mem_cgroup_get_recursive_idx_stat(mem, |
2544 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | 2753 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); |
2754 | val += idx_val; | ||
2545 | val <<= PAGE_SHIFT; | 2755 | val <<= PAGE_SHIFT; |
2546 | } else | 2756 | } else |
2547 | val = res_counter_read_u64(&mem->memsw, name); | 2757 | val = res_counter_read_u64(&mem->memsw, name); |
@@ -2661,7 +2871,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2661 | enum { | 2871 | enum { |
2662 | MCS_CACHE, | 2872 | MCS_CACHE, |
2663 | MCS_RSS, | 2873 | MCS_RSS, |
2664 | MCS_MAPPED_FILE, | 2874 | MCS_FILE_MAPPED, |
2665 | MCS_PGPGIN, | 2875 | MCS_PGPGIN, |
2666 | MCS_PGPGOUT, | 2876 | MCS_PGPGOUT, |
2667 | MCS_SWAP, | 2877 | MCS_SWAP, |
@@ -2705,8 +2915,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2705 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 2915 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2706 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 2916 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
2707 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 2917 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2708 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | 2918 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); |
2709 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | 2919 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2710 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 2920 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2711 | s->stat[MCS_PGPGIN] += val; | 2921 | s->stat[MCS_PGPGIN] += val; |
2712 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2922 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
@@ -3098,11 +3308,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3098 | 3308 | ||
3099 | /* root ? */ | 3309 | /* root ? */ |
3100 | if (cont->parent == NULL) { | 3310 | if (cont->parent == NULL) { |
3311 | int cpu; | ||
3101 | enable_swap_cgroup(); | 3312 | enable_swap_cgroup(); |
3102 | parent = NULL; | 3313 | parent = NULL; |
3103 | root_mem_cgroup = mem; | 3314 | root_mem_cgroup = mem; |
3104 | if (mem_cgroup_soft_limit_tree_init()) | 3315 | if (mem_cgroup_soft_limit_tree_init()) |
3105 | goto free_out; | 3316 | goto free_out; |
3317 | for_each_possible_cpu(cpu) { | ||
3318 | struct memcg_stock_pcp *stock = | ||
3319 | &per_cpu(memcg_stock, cpu); | ||
3320 | INIT_WORK(&stock->work, drain_local_stock); | ||
3321 | } | ||
3322 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
3106 | 3323 | ||
3107 | } else { | 3324 | } else { |
3108 | parent = mem_cgroup_from_cont(cont->parent); | 3325 | parent = mem_cgroup_from_cont(cont->parent); |
@@ -3171,12 +3388,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
3171 | struct task_struct *p, | 3388 | struct task_struct *p, |
3172 | bool threadgroup) | 3389 | bool threadgroup) |
3173 | { | 3390 | { |
3174 | mutex_lock(&memcg_tasklist); | ||
3175 | /* | 3391 | /* |
3176 | * FIXME: It's better to move charges of this process from old | 3392 | * FIXME: It's better to move charges of this process from old |
3177 | * memcg to new memcg. But it's just on TODO-List now. | 3393 | * memcg to new memcg. But it's just on TODO-List now. |
3178 | */ | 3394 | */ |
3179 | mutex_unlock(&memcg_tasklist); | ||
3180 | } | 3395 | } |
3181 | 3396 | ||
3182 | struct cgroup_subsys mem_cgroup_subsys = { | 3397 | struct cgroup_subsys mem_cgroup_subsys = { |