diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-12-15 19:47:08 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-16 10:20:07 -0500 |
commit | cdec2e4265dfa09490601b00aeabd8a8d4af30f0 (patch) | |
tree | d120759b91e3e24d33f3d76010c9da990f115706 | |
parent | 569b846df54ffb2827b83ce3244c5f032394cba4 (diff) |
memcg: coalesce charging via percpu storage
This patch coalesces accesses to res_counter at charge time by using a percpu
cache. At charge, memcg charges 64 pages at once and remembers them in a percpu
cache. Because it is a cache, it is drained/flushed when necessary.
This version uses public percpu area.
There are 2 benefits to using the public percpu area:
1. The sum of stocked charges in the system is limited by the # of cpus,
not by the number of memcgs. This gives better synchronization.
2. The drain code for flush/cpuhotplug is very easy (and quick).
The most important point of this patch is that we never touch res_counter
in fast path. The res_counter is system-wide shared counter which is modified
very frequently. We should avoid touching it as far as we can, to prevent
false sharing.
On an x86-64 8cpu server, I tested the overhead of memcg at page fault by
running a program which does map/fault/unmap in a loop. One task per cpu
was run, pinned by taskset, and the total number of page faults in 60secs
was counted.
[without memcg config]
40156968 page-faults # 0.085 M/sec ( +- 0.046% )
27.67 cache-miss/faults
[root cgroup]
36659599 page-faults # 0.077 M/sec ( +- 0.247% )
31.58 cache miss/faults
[in a child cgroup]
18444157 page-faults # 0.039 M/sec ( +- 0.133% )
69.96 cache miss/faults
[ + coalescing uncharge patch]
27133719 page-faults # 0.057 M/sec ( +- 0.155% )
47.16 cache miss/faults
[ + coalescing uncharge patch + this patch ]
34224709 page-faults # 0.072 M/sec ( +- 0.173% )
34.69 cache miss/faults
Changelog (since Oct/2):
- updated comments
- replaced get_cpu_var() with __get_cpu_var() if possible.
- removed mutex for system-wide drain. adds a counter instead of it.
- removed CONFIG_HOTPLUG_CPU
Changelog (old):
- rebased onto the latest mmotm
- moved charge size check before __GFP_WAIT check for avoiding unnecessary
- added asynchronous flush routine.
- fixed bugs pointed out by Nishimura-san.
[akpm@linux-foundation.org: tweak comments]
[nishimura@mxp.nes.nec.co.jp: don't do INIT_WORK() repeatedly against the same work_struct]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/memcontrol.c | 162 |
1 files changed, 156 insertions, 6 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a730c91b8e69..6587f657d57c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/mm_inline.h> | 39 | #include <linux/mm_inline.h> |
40 | #include <linux/page_cgroup.h> | 40 | #include <linux/page_cgroup.h> |
41 | #include <linux/cpu.h> | ||
41 | #include "internal.h" | 42 | #include "internal.h" |
42 | 43 | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
@@ -275,6 +276,7 @@ enum charge_type { | |||
275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 276 | static void mem_cgroup_get(struct mem_cgroup *mem); |
276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 277 | static void mem_cgroup_put(struct mem_cgroup *mem); |
277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 278 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
279 | static void drain_all_stock_async(void); | ||
278 | 280 | ||
279 | static struct mem_cgroup_per_zone * | 281 | static struct mem_cgroup_per_zone * |
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 282 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -1137,6 +1139,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1137 | victim = mem_cgroup_select_victim(root_mem); | 1139 | victim = mem_cgroup_select_victim(root_mem); |
1138 | if (victim == root_mem) { | 1140 | if (victim == root_mem) { |
1139 | loop++; | 1141 | loop++; |
1142 | if (loop >= 1) | ||
1143 | drain_all_stock_async(); | ||
1140 | if (loop >= 2) { | 1144 | if (loop >= 2) { |
1141 | /* | 1145 | /* |
1142 | * If we have not been able to reclaim | 1146 | * If we have not been able to reclaim |
@@ -1259,6 +1263,133 @@ done: | |||
1259 | } | 1263 | } |
1260 | 1264 | ||
1261 | /* | 1265 | /* |
1266 | * size of first charge trial. "32" comes from vmscan.c's magic value. | ||
1267 | * TODO: maybe necessary to use big numbers in big irons. | ||
1268 | */ | ||
1269 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
1270 | struct memcg_stock_pcp { | ||
1271 | struct mem_cgroup *cached; /* this never be root cgroup */ | ||
1272 | int charge; | ||
1273 | struct work_struct work; | ||
1274 | }; | ||
1275 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
1276 | static atomic_t memcg_drain_count; | ||
1277 | |||
1278 | /* | ||
1279 | * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed | ||
1280 | * from local stock and true is returned. If the stock is 0 or charges from a | ||
1281 | * cgroup which is not current target, returns false. This stock will be | ||
1282 | * refilled. | ||
1283 | */ | ||
1284 | static bool consume_stock(struct mem_cgroup *mem) | ||
1285 | { | ||
1286 | struct memcg_stock_pcp *stock; | ||
1287 | bool ret = true; | ||
1288 | |||
1289 | stock = &get_cpu_var(memcg_stock); | ||
1290 | if (mem == stock->cached && stock->charge) | ||
1291 | stock->charge -= PAGE_SIZE; | ||
1292 | else /* need to call res_counter_charge */ | ||
1293 | ret = false; | ||
1294 | put_cpu_var(memcg_stock); | ||
1295 | return ret; | ||
1296 | } | ||
1297 | |||
1298 | /* | ||
1299 | * Returns stocks cached in percpu to res_counter and reset cached information. | ||
1300 | */ | ||
1301 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
1302 | { | ||
1303 | struct mem_cgroup *old = stock->cached; | ||
1304 | |||
1305 | if (stock->charge) { | ||
1306 | res_counter_uncharge(&old->res, stock->charge); | ||
1307 | if (do_swap_account) | ||
1308 | res_counter_uncharge(&old->memsw, stock->charge); | ||
1309 | } | ||
1310 | stock->cached = NULL; | ||
1311 | stock->charge = 0; | ||
1312 | } | ||
1313 | |||
1314 | /* | ||
1315 | * This must be called under preempt disabled or must be called by | ||
1316 | * a thread which is pinned to local cpu. | ||
1317 | */ | ||
1318 | static void drain_local_stock(struct work_struct *dummy) | ||
1319 | { | ||
1320 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
1321 | drain_stock(stock); | ||
1322 | } | ||
1323 | |||
1324 | /* | ||
1325 | * Cache charges(val) which is from res_counter, to local per_cpu area. | ||
1326 | * This will be consumed by consumt_stock() function, later. | ||
1327 | */ | ||
1328 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
1329 | { | ||
1330 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
1331 | |||
1332 | if (stock->cached != mem) { /* reset if necessary */ | ||
1333 | drain_stock(stock); | ||
1334 | stock->cached = mem; | ||
1335 | } | ||
1336 | stock->charge += val; | ||
1337 | put_cpu_var(memcg_stock); | ||
1338 | } | ||
1339 | |||
1340 | /* | ||
1341 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
1342 | * and just put a work per cpu for draining localy on each cpu. Caller can | ||
1343 | * expects some charges will be back to res_counter later but cannot wait for | ||
1344 | * it. | ||
1345 | */ | ||
1346 | static void drain_all_stock_async(void) | ||
1347 | { | ||
1348 | int cpu; | ||
1349 | /* This function is for scheduling "drain" in asynchronous way. | ||
1350 | * The result of "drain" is not directly handled by callers. Then, | ||
1351 | * if someone is calling drain, we don't have to call drain more. | ||
1352 | * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if | ||
1353 | * there is a race. We just do loose check here. | ||
1354 | */ | ||
1355 | if (atomic_read(&memcg_drain_count)) | ||
1356 | return; | ||
1357 | /* Notify other cpus that system-wide "drain" is running */ | ||
1358 | atomic_inc(&memcg_drain_count); | ||
1359 | get_online_cpus(); | ||
1360 | for_each_online_cpu(cpu) { | ||
1361 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
1362 | schedule_work_on(cpu, &stock->work); | ||
1363 | } | ||
1364 | put_online_cpus(); | ||
1365 | atomic_dec(&memcg_drain_count); | ||
1366 | /* We don't wait for flush_work */ | ||
1367 | } | ||
1368 | |||
1369 | /* This is a synchronous drain interface. */ | ||
1370 | static void drain_all_stock_sync(void) | ||
1371 | { | ||
1372 | /* called when force_empty is called */ | ||
1373 | atomic_inc(&memcg_drain_count); | ||
1374 | schedule_on_each_cpu(drain_local_stock); | ||
1375 | atomic_dec(&memcg_drain_count); | ||
1376 | } | ||
1377 | |||
1378 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
1379 | unsigned long action, | ||
1380 | void *hcpu) | ||
1381 | { | ||
1382 | int cpu = (unsigned long)hcpu; | ||
1383 | struct memcg_stock_pcp *stock; | ||
1384 | |||
1385 | if (action != CPU_DEAD) | ||
1386 | return NOTIFY_OK; | ||
1387 | stock = &per_cpu(memcg_stock, cpu); | ||
1388 | drain_stock(stock); | ||
1389 | return NOTIFY_OK; | ||
1390 | } | ||
1391 | |||
1392 | /* | ||
1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1393 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1263 | * oom-killer can be invoked. | 1394 | * oom-killer can be invoked. |
1264 | */ | 1395 | */ |
@@ -1269,6 +1400,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1269 | struct mem_cgroup *mem, *mem_over_limit; | 1400 | struct mem_cgroup *mem, *mem_over_limit; |
1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1401 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1271 | struct res_counter *fail_res; | 1402 | struct res_counter *fail_res; |
1403 | int csize = CHARGE_SIZE; | ||
1272 | 1404 | ||
1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1405 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
1274 | /* Don't account this! */ | 1406 | /* Don't account this! */ |
@@ -1293,23 +1425,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1293 | return 0; | 1425 | return 0; |
1294 | 1426 | ||
1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1427 | VM_BUG_ON(css_is_removed(&mem->css)); |
1428 | if (mem_cgroup_is_root(mem)) | ||
1429 | goto done; | ||
1296 | 1430 | ||
1297 | while (1) { | 1431 | while (1) { |
1298 | int ret = 0; | 1432 | int ret = 0; |
1299 | unsigned long flags = 0; | 1433 | unsigned long flags = 0; |
1300 | 1434 | ||
1301 | if (mem_cgroup_is_root(mem)) | 1435 | if (consume_stock(mem)) |
1302 | goto done; | 1436 | goto charged; |
1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1437 | |
1438 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1304 | if (likely(!ret)) { | 1439 | if (likely(!ret)) { |
1305 | if (!do_swap_account) | 1440 | if (!do_swap_account) |
1306 | break; | 1441 | break; |
1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1442 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1308 | &fail_res); | ||
1309 | if (likely(!ret)) | 1443 | if (likely(!ret)) |
1310 | break; | 1444 | break; |
1311 | /* mem+swap counter fails */ | 1445 | /* mem+swap counter fails */ |
1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1446 | res_counter_uncharge(&mem->res, csize); |
1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1447 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1448 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1315 | memsw); | 1449 | memsw); |
@@ -1318,6 +1452,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1452 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1319 | res); | 1453 | res); |
1320 | 1454 | ||
1455 | /* reduce request size and retry */ | ||
1456 | if (csize > PAGE_SIZE) { | ||
1457 | csize = PAGE_SIZE; | ||
1458 | continue; | ||
1459 | } | ||
1321 | if (!(gfp_mask & __GFP_WAIT)) | 1460 | if (!(gfp_mask & __GFP_WAIT)) |
1322 | goto nomem; | 1461 | goto nomem; |
1323 | 1462 | ||
@@ -1347,6 +1486,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1347 | goto nomem; | 1486 | goto nomem; |
1348 | } | 1487 | } |
1349 | } | 1488 | } |
1489 | if (csize > PAGE_SIZE) | ||
1490 | refill_stock(mem, csize - PAGE_SIZE); | ||
1491 | charged: | ||
1350 | /* | 1492 | /* |
1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1493 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
1352 | * if they exceeds softlimit. | 1494 | * if they exceeds softlimit. |
@@ -2469,6 +2611,7 @@ move_account: | |||
2469 | goto out; | 2611 | goto out; |
2470 | /* This is for making all *used* pages to be on LRU. */ | 2612 | /* This is for making all *used* pages to be on LRU. */ |
2471 | lru_add_drain_all(); | 2613 | lru_add_drain_all(); |
2614 | drain_all_stock_sync(); | ||
2472 | ret = 0; | 2615 | ret = 0; |
2473 | for_each_node_state(node, N_HIGH_MEMORY) { | 2616 | for_each_node_state(node, N_HIGH_MEMORY) { |
2474 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2617 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
@@ -3183,11 +3326,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3183 | 3326 | ||
3184 | /* root ? */ | 3327 | /* root ? */ |
3185 | if (cont->parent == NULL) { | 3328 | if (cont->parent == NULL) { |
3329 | int cpu; | ||
3186 | enable_swap_cgroup(); | 3330 | enable_swap_cgroup(); |
3187 | parent = NULL; | 3331 | parent = NULL; |
3188 | root_mem_cgroup = mem; | 3332 | root_mem_cgroup = mem; |
3189 | if (mem_cgroup_soft_limit_tree_init()) | 3333 | if (mem_cgroup_soft_limit_tree_init()) |
3190 | goto free_out; | 3334 | goto free_out; |
3335 | for_each_possible_cpu(cpu) { | ||
3336 | struct memcg_stock_pcp *stock = | ||
3337 | &per_cpu(memcg_stock, cpu); | ||
3338 | INIT_WORK(&stock->work, drain_local_stock); | ||
3339 | } | ||
3340 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
3191 | 3341 | ||
3192 | } else { | 3342 | } else { |
3193 | parent = mem_cgroup_from_cont(cont->parent); | 3343 | parent = mem_cgroup_from_cont(cont->parent); |