diff options
author | Balbir Singh <balbir@linux.vnet.ibm.com> | 2009-09-23 18:56:42 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-24 10:20:59 -0400 |
commit | 0c3e73e84fe3f64cf1c2e8bb4e91e8901cbcdc38 (patch) | |
tree | 1c3e27fce3babd3703b34c9a353f253fcefb6325 | |
parent | 4e41695356fb4e0b153be1440ad027e46e0a7ea2 (diff) |
memcg: improve resource counter scalability
Reduce the resource counter overhead (mostly spinlock) associated with the
root cgroup. This is part of a series of patches to reduce mem cgroup
overhead. I had posted other approaches earlier (including using percpu
counters). Those patches will be a natural addition and will be added
iteratively on top of these.
The patch stops resource counter accounting for the root cgroup. The data
for display is derived from the statistics we maintain via
mem_cgroup_charge_statistics() (which is more scalable). What happens today
is that we do double accounting, once using res_counter_charge() and once
using mem_cgroup_charge_statistics(). For the root, since we don't
implement limits any more, we don't need to track every charge via
res_counter_charge() and check for limit being exceeded and reclaim.
The main mem->res usage_in_bytes can be derived by summing the cache and
rss usage data from memory statistics (MEM_CGROUP_STAT_RSS and
MEM_CGROUP_STAT_CACHE). However, for memsw->res usage_in_bytes, we need
additional data about swapped out memory. This patch adds a
MEM_CGROUP_STAT_SWAPOUT and uses that along with MEM_CGROUP_STAT_RSS and
MEM_CGROUP_STAT_CACHE to derive the memsw data. This data is computed
recursively when hierarchy is enabled.
The test results I see on a 24-way machine show that
1. The lock contention disappears from /proc/lock_stat
2. The results of the test are comparable to running with
cgroup_disable=memory.
Here is a sample of my program runs
Without Patch
Performance counter stats for '/home/balbir/parallel_pagefault':
7192804.124144 task-clock-msecs # 23.937 CPUs
424691 context-switches # 0.000 M/sec
267 CPU-migrations # 0.000 M/sec
28498113 page-faults # 0.004 M/sec
5826093739340 cycles # 809.989 M/sec
408883496292 instructions # 0.070 IPC
7057079452 cache-references # 0.981 M/sec
3036086243 cache-misses # 0.422 M/sec
300.485365680 seconds time elapsed
With cgroup_disable=memory
Performance counter stats for '/home/balbir/parallel_pagefault':
7182183.546587 task-clock-msecs # 23.915 CPUs
425458 context-switches # 0.000 M/sec
203 CPU-migrations # 0.000 M/sec
92545093 page-faults # 0.013 M/sec
6034363609986 cycles # 840.185 M/sec
437204346785 instructions # 0.072 IPC
6636073192 cache-references # 0.924 M/sec
2358117732 cache-misses # 0.328 M/sec
300.320905827 seconds time elapsed
With this patch applied
Performance counter stats for '/home/balbir/parallel_pagefault':
7191619.223977 task-clock-msecs # 23.955 CPUs
422579 context-switches # 0.000 M/sec
88 CPU-migrations # 0.000 M/sec
91946060 page-faults # 0.013 M/sec
5957054385619 cycles # 828.333 M/sec
1058117350365 instructions # 0.178 IPC
9161776218 cache-references # 1.274 M/sec
1920494280 cache-misses # 0.267 M/sec
300.218764862 seconds time elapsed
Data from Prarit (kernel compile with make -j64 on a 64
CPU/32G machine)
For a single run
Without patch
real 27m8.988s
user 87m24.916s
sys 382m6.037s
With patch
real 4m18.607s
user 84m58.943s
sys 50m52.682s
With config turned off
real 4m54.972s
user 90m13.456s
sys 50m19.711s
NOTE: The data looks counterintuitive due to the increased performance
with the patch, even over the config being turned off. We probably need
more runs, but so far all testing has shown that the patches definitely
help.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/memcontrol.c | 121 |
1 files changed, 100 insertions, 21 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 011aba6cad70..2011f15278fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -70,6 +70,7 @@ enum mem_cgroup_stat_index { | |||
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ |
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | ||
73 | 74 | ||
74 | MEM_CGROUP_STAT_NSTATS, | 75 | MEM_CGROUP_STAT_NSTATS, |
75 | }; | 76 | }; |
@@ -478,11 +479,24 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
478 | return mz; | 479 | return mz; |
479 | } | 480 | } |
480 | 481 | ||
482 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | ||
483 | bool charge) | ||
484 | { | ||
485 | int val = (charge) ? 1 : -1; | ||
486 | struct mem_cgroup_stat *stat = &mem->stat; | ||
487 | struct mem_cgroup_stat_cpu *cpustat; | ||
488 | int cpu = get_cpu(); | ||
489 | |||
490 | cpustat = &stat->cpustat[cpu]; | ||
491 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
492 | put_cpu(); | ||
493 | } | ||
494 | |||
481 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 495 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
482 | struct page_cgroup *pc, | 496 | struct page_cgroup *pc, |
483 | bool charge) | 497 | bool charge) |
484 | { | 498 | { |
485 | int val = (charge)? 1 : -1; | 499 | int val = (charge) ? 1 : -1; |
486 | struct mem_cgroup_stat *stat = &mem->stat; | 500 | struct mem_cgroup_stat *stat = &mem->stat; |
487 | struct mem_cgroup_stat_cpu *cpustat; | 501 | struct mem_cgroup_stat_cpu *cpustat; |
488 | int cpu = get_cpu(); | 502 | int cpu = get_cpu(); |
@@ -1285,9 +1299,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1285 | VM_BUG_ON(css_is_removed(&mem->css)); | 1299 | VM_BUG_ON(css_is_removed(&mem->css)); |
1286 | 1300 | ||
1287 | while (1) { | 1301 | while (1) { |
1288 | int ret; | 1302 | int ret = 0; |
1289 | unsigned long flags = 0; | 1303 | unsigned long flags = 0; |
1290 | 1304 | ||
1305 | if (mem_cgroup_is_root(mem)) | ||
1306 | goto done; | ||
1291 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, | 1307 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, |
1292 | &soft_fail_res); | 1308 | &soft_fail_res); |
1293 | if (likely(!ret)) { | 1309 | if (likely(!ret)) { |
@@ -1347,6 +1363,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1347 | if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) | 1363 | if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) |
1348 | mem_cgroup_update_tree(mem_over_soft_limit, page); | 1364 | mem_cgroup_update_tree(mem_over_soft_limit, page); |
1349 | } | 1365 | } |
1366 | done: | ||
1350 | return 0; | 1367 | return 0; |
1351 | nomem: | 1368 | nomem: |
1352 | css_put(&mem->css); | 1369 | css_put(&mem->css); |
@@ -1419,9 +1436,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1419 | lock_page_cgroup(pc); | 1436 | lock_page_cgroup(pc); |
1420 | if (unlikely(PageCgroupUsed(pc))) { | 1437 | if (unlikely(PageCgroupUsed(pc))) { |
1421 | unlock_page_cgroup(pc); | 1438 | unlock_page_cgroup(pc); |
1422 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); | 1439 | if (!mem_cgroup_is_root(mem)) { |
1423 | if (do_swap_account) | 1440 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1424 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | 1441 | if (do_swap_account) |
1442 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, | ||
1443 | NULL); | ||
1444 | } | ||
1425 | css_put(&mem->css); | 1445 | css_put(&mem->css); |
1426 | return; | 1446 | return; |
1427 | } | 1447 | } |
@@ -1498,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1498 | if (pc->mem_cgroup != from) | 1518 | if (pc->mem_cgroup != from) |
1499 | goto out; | 1519 | goto out; |
1500 | 1520 | ||
1501 | res_counter_uncharge(&from->res, PAGE_SIZE, NULL); | 1521 | if (!mem_cgroup_is_root(from)) |
1522 | res_counter_uncharge(&from->res, PAGE_SIZE, NULL); | ||
1502 | mem_cgroup_charge_statistics(from, pc, false); | 1523 | mem_cgroup_charge_statistics(from, pc, false); |
1503 | 1524 | ||
1504 | page = pc->page; | 1525 | page = pc->page; |
@@ -1517,7 +1538,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1517 | 1); | 1538 | 1); |
1518 | } | 1539 | } |
1519 | 1540 | ||
1520 | if (do_swap_account) | 1541 | if (do_swap_account && !mem_cgroup_is_root(from)) |
1521 | res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); | 1542 | res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); |
1522 | css_put(&from->css); | 1543 | css_put(&from->css); |
1523 | 1544 | ||
@@ -1588,9 +1609,11 @@ uncharge: | |||
1588 | /* drop extra refcnt by try_charge() */ | 1609 | /* drop extra refcnt by try_charge() */ |
1589 | css_put(&parent->css); | 1610 | css_put(&parent->css); |
1590 | /* uncharge if move fails */ | 1611 | /* uncharge if move fails */ |
1591 | res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); | 1612 | if (!mem_cgroup_is_root(parent)) { |
1592 | if (do_swap_account) | 1613 | res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); |
1593 | res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); | 1614 | if (do_swap_account) |
1615 | res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); | ||
1616 | } | ||
1594 | return ret; | 1617 | return ret; |
1595 | } | 1618 | } |
1596 | 1619 | ||
@@ -1779,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1779 | * This recorded memcg can be obsolete one. So, avoid | 1802 | * This recorded memcg can be obsolete one. So, avoid |
1780 | * calling css_tryget | 1803 | * calling css_tryget |
1781 | */ | 1804 | */ |
1782 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); | 1805 | if (!mem_cgroup_is_root(memcg)) |
1806 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, | ||
1807 | NULL); | ||
1808 | mem_cgroup_swap_statistics(memcg, false); | ||
1783 | mem_cgroup_put(memcg); | 1809 | mem_cgroup_put(memcg); |
1784 | } | 1810 | } |
1785 | rcu_read_unlock(); | 1811 | rcu_read_unlock(); |
@@ -1804,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1804 | return; | 1830 | return; |
1805 | if (!mem) | 1831 | if (!mem) |
1806 | return; | 1832 | return; |
1807 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); | 1833 | if (!mem_cgroup_is_root(mem)) { |
1808 | if (do_swap_account) | 1834 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1809 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | 1835 | if (do_swap_account) |
1836 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
1837 | } | ||
1810 | css_put(&mem->css); | 1838 | css_put(&mem->css); |
1811 | } | 1839 | } |
1812 | 1840 | ||
@@ -1859,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1859 | break; | 1887 | break; |
1860 | } | 1888 | } |
1861 | 1889 | ||
1862 | res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); | 1890 | if (!mem_cgroup_is_root(mem)) { |
1863 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | 1891 | res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); |
1864 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | 1892 | if (do_swap_account && |
1893 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1894 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
1895 | } | ||
1896 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1897 | mem_cgroup_swap_statistics(mem, true); | ||
1865 | mem_cgroup_charge_statistics(mem, pc, false); | 1898 | mem_cgroup_charge_statistics(mem, pc, false); |
1866 | 1899 | ||
1867 | ClearPageCgroupUsed(pc); | 1900 | ClearPageCgroupUsed(pc); |
@@ -1952,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
1952 | * We uncharge this because swap is freed. | 1985 | * We uncharge this because swap is freed. |
1953 | * This memcg can be obsolete one. We avoid calling css_tryget | 1986 | * This memcg can be obsolete one. We avoid calling css_tryget |
1954 | */ | 1987 | */ |
1955 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); | 1988 | if (!mem_cgroup_is_root(memcg)) |
1989 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); | ||
1990 | mem_cgroup_swap_statistics(memcg, false); | ||
1956 | mem_cgroup_put(memcg); | 1991 | mem_cgroup_put(memcg); |
1957 | } | 1992 | } |
1958 | rcu_read_unlock(); | 1993 | rcu_read_unlock(); |
@@ -2464,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2464 | return retval; | 2499 | return retval; |
2465 | } | 2500 | } |
2466 | 2501 | ||
2502 | struct mem_cgroup_idx_data { | ||
2503 | s64 val; | ||
2504 | enum mem_cgroup_stat_index idx; | ||
2505 | }; | ||
2506 | |||
2507 | static int | ||
2508 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | ||
2509 | { | ||
2510 | struct mem_cgroup_idx_data *d = data; | ||
2511 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | ||
2512 | return 0; | ||
2513 | } | ||
2514 | |||
2515 | static void | ||
2516 | mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | ||
2517 | enum mem_cgroup_stat_index idx, s64 *val) | ||
2518 | { | ||
2519 | struct mem_cgroup_idx_data d; | ||
2520 | d.idx = idx; | ||
2521 | d.val = 0; | ||
2522 | mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); | ||
2523 | *val = d.val; | ||
2524 | } | ||
2525 | |||
2467 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2526 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2468 | { | 2527 | { |
2469 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2528 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2470 | u64 val = 0; | 2529 | u64 idx_val, val; |
2471 | int type, name; | 2530 | int type, name; |
2472 | 2531 | ||
2473 | type = MEMFILE_TYPE(cft->private); | 2532 | type = MEMFILE_TYPE(cft->private); |
2474 | name = MEMFILE_ATTR(cft->private); | 2533 | name = MEMFILE_ATTR(cft->private); |
2475 | switch (type) { | 2534 | switch (type) { |
2476 | case _MEM: | 2535 | case _MEM: |
2477 | val = res_counter_read_u64(&mem->res, name); | 2536 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2537 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2538 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2539 | val = idx_val; | ||
2540 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2541 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2542 | val += idx_val; | ||
2543 | val <<= PAGE_SHIFT; | ||
2544 | } else | ||
2545 | val = res_counter_read_u64(&mem->res, name); | ||
2478 | break; | 2546 | break; |
2479 | case _MEMSWAP: | 2547 | case _MEMSWAP: |
2480 | val = res_counter_read_u64(&mem->memsw, name); | 2548 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2549 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2550 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2551 | val = idx_val; | ||
2552 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2553 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2554 | val += idx_val; | ||
2555 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2556 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2557 | val <<= PAGE_SHIFT; | ||
2558 | } else | ||
2559 | val = res_counter_read_u64(&mem->memsw, name); | ||
2481 | break; | 2560 | break; |
2482 | default: | 2561 | default: |
2483 | BUG(); | 2562 | BUG(); |